aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-block-bcache156
-rw-r--r--Documentation/bcache.txt431
-rw-r--r--MAINTAINERS7
-rw-r--r--drivers/block/aoe/aoecmd.c6
-rw-r--r--drivers/block/cciss.c22
-rw-r--r--drivers/block/drbd/drbd_actlog.c246
-rw-r--r--drivers/block/drbd/drbd_bitmap.c13
-rw-r--r--drivers/block/drbd/drbd_int.h179
-rw-r--r--drivers/block/drbd/drbd_main.c251
-rw-r--r--drivers/block/drbd/drbd_nl.c200
-rw-r--r--drivers/block/drbd/drbd_proc.c10
-rw-r--r--drivers/block/drbd/drbd_receiver.c16
-rw-r--r--drivers/block/drbd/drbd_req.c192
-rw-r--r--drivers/block/drbd/drbd_req.h8
-rw-r--r--drivers/block/drbd/drbd_state.c28
-rw-r--r--drivers/block/drbd/drbd_strings.c1
-rw-r--r--drivers/block/drbd/drbd_worker.c24
-rw-r--r--drivers/block/mg_disk.c2
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c79
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h11
-rw-r--r--drivers/md/Kconfig2
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/bcache/Kconfig42
-rw-r--r--drivers/md/bcache/Makefile7
-rw-r--r--drivers/md/bcache/alloc.c599
-rw-r--r--drivers/md/bcache/bcache.h1259
-rw-r--r--drivers/md/bcache/bset.c1192
-rw-r--r--drivers/md/bcache/bset.h379
-rw-r--r--drivers/md/bcache/btree.c2503
-rw-r--r--drivers/md/bcache/btree.h405
-rw-r--r--drivers/md/bcache/closure.c345
-rw-r--r--drivers/md/bcache/closure.h672
-rw-r--r--drivers/md/bcache/debug.c565
-rw-r--r--drivers/md/bcache/debug.h54
-rw-r--r--drivers/md/bcache/io.c397
-rw-r--r--drivers/md/bcache/journal.c787
-rw-r--r--drivers/md/bcache/journal.h215
-rw-r--r--drivers/md/bcache/movinggc.c254
-rw-r--r--drivers/md/bcache/request.c1411
-rw-r--r--drivers/md/bcache/request.h62
-rw-r--r--drivers/md/bcache/stats.c246
-rw-r--r--drivers/md/bcache/stats.h58
-rw-r--r--drivers/md/bcache/super.c1987
-rw-r--r--drivers/md/bcache/sysfs.c817
-rw-r--r--drivers/md/bcache/sysfs.h110
-rw-r--r--drivers/md/bcache/trace.c26
-rw-r--r--drivers/md/bcache/util.c377
-rw-r--r--drivers/md/bcache/util.h589
-rw-r--r--drivers/md/bcache/writeback.c414
-rw-r--r--include/linux/cgroup_subsys.h6
-rw-r--r--include/linux/drbd.h5
-rw-r--r--include/linux/drbd_limits.h11
-rw-r--r--include/linux/idr.h10
-rw-r--r--include/linux/lru_cache.h1
-rw-r--r--include/linux/rwsem.h10
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/trace/events/bcache.h271
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/lockdep.c1
-rw-r--r--kernel/rwsem.c16
-rw-r--r--kernel/trace/blktrace.c1
-rw-r--r--lib/lru_cache.c56
62 files changed, 17681 insertions, 372 deletions
diff --git a/Documentation/ABI/testing/sysfs-block-bcache b/Documentation/ABI/testing/sysfs-block-bcache
new file mode 100644
index 000000000000..9e4bbc5d51fd
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-block-bcache
@@ -0,0 +1,156 @@
1What: /sys/block/<disk>/bcache/unregister
2Date: November 2010
3Contact: Kent Overstreet <kent.overstreet@gmail.com>
4Description:
5 A write to this file causes the backing device or cache to be
6 unregistered. If a backing device had dirty data in the cache,
7 writeback mode is automatically disabled and all dirty data is
8 flushed before the device is unregistered. Caches unregister
9 all associated backing devices before unregistering themselves.
10
11What: /sys/block/<disk>/bcache/clear_stats
12Date: November 2010
13Contact: Kent Overstreet <kent.overstreet@gmail.com>
14Description:
15 Writing to this file resets all the statistics for the device.
16
17What: /sys/block/<disk>/bcache/cache
18Date: November 2010
19Contact: Kent Overstreet <kent.overstreet@gmail.com>
20Description:
21 For a backing device that has cache, a symlink to
22 the bcache/ dir of that cache.
23
24What: /sys/block/<disk>/bcache/cache_hits
25Date: November 2010
26Contact: Kent Overstreet <kent.overstreet@gmail.com>
27Description:
28 For backing devices: integer number of full cache hits,
29 counted per bio. A partial cache hit counts as a miss.
30
31What: /sys/block/<disk>/bcache/cache_misses
32Date: November 2010
33Contact: Kent Overstreet <kent.overstreet@gmail.com>
34Description:
35 For backing devices: integer number of cache misses.
36
37What: /sys/block/<disk>/bcache/cache_hit_ratio
38Date: November 2010
39Contact: Kent Overstreet <kent.overstreet@gmail.com>
40Description:
41 For backing devices: cache hits as a percentage.
42
43What: /sys/block/<disk>/bcache/sequential_cutoff
44Date: November 2010
45Contact: Kent Overstreet <kent.overstreet@gmail.com>
46Description:
47 For backing devices: Threshold past which sequential IO will
48 skip the cache. Read and written as bytes in human readable
49 units (e.g. echo 10M > sequential_cutoff).
50
51What: /sys/block/<disk>/bcache/bypassed
52Date: November 2010
53Contact: Kent Overstreet <kent.overstreet@gmail.com>
54Description:
55 Sum of all reads and writes that have bypassed the cache (due
56 to the sequential cutoff). Expressed as bytes in human
57 readable units.
58
59What: /sys/block/<disk>/bcache/writeback
60Date: November 2010
61Contact: Kent Overstreet <kent.overstreet@gmail.com>
62Description:
63 For backing devices: When on, writeback caching is enabled and
64 writes will be buffered in the cache. When off, caching is in
65 writethrough mode; reads and writes will be added to the
66 cache but no write buffering will take place.
67
68What: /sys/block/<disk>/bcache/writeback_running
69Date: November 2010
70Contact: Kent Overstreet <kent.overstreet@gmail.com>
71Description:
72 For backing devices: when off, dirty data will not be written
73 from the cache to the backing device. The cache will still be
74 used to buffer writes until it is mostly full, at which point
75 writes transparently revert to writethrough mode. Intended only
76 for benchmarking/testing.
77
78What: /sys/block/<disk>/bcache/writeback_delay
79Date: November 2010
80Contact: Kent Overstreet <kent.overstreet@gmail.com>
81Description:
82 For backing devices: In writeback mode, when dirty data is
83 written to the cache and the cache held no dirty data for that
84 backing device, writeback from cache to backing device starts
85 after this delay, expressed as an integer number of seconds.
86
87What: /sys/block/<disk>/bcache/writeback_percent
88Date: November 2010
89Contact: Kent Overstreet <kent.overstreet@gmail.com>
90Description:
91 For backing devices: If nonzero, writeback from cache to
92 backing device only takes place when more than this percentage
93 of the cache is used, allowing more write coalescing to take
94 place and reducing total number of writes sent to the backing
95 device. Integer between 0 and 40.
96
97What: /sys/block/<disk>/bcache/synchronous
98Date: November 2010
99Contact: Kent Overstreet <kent.overstreet@gmail.com>
100Description:
101 For a cache, a boolean that allows synchronous mode to be
102 switched on and off. In synchronous mode all writes are ordered
103 such that the cache can reliably recover from unclean shutdown;
104 if disabled bcache will not generally wait for writes to
105 complete but if the cache is not shut down cleanly all data
106 will be discarded from the cache. Should not be turned off with
107 writeback caching enabled.
108
109What: /sys/block/<disk>/bcache/discard
110Date: November 2010
111Contact: Kent Overstreet <kent.overstreet@gmail.com>
112Description:
113 For a cache, a boolean allowing discard/TRIM to be turned off
114 or back on if the device supports it.
115
116What: /sys/block/<disk>/bcache/bucket_size
117Date: November 2010
118Contact: Kent Overstreet <kent.overstreet@gmail.com>
119Description:
120 For a cache, bucket size in human readable units, as set at
121 cache creation time; should match the erase block size of the
122 SSD for optimal performance.
123
124What: /sys/block/<disk>/bcache/nbuckets
125Date: November 2010
126Contact: Kent Overstreet <kent.overstreet@gmail.com>
127Description:
128 For a cache, the number of usable buckets.
129
130What: /sys/block/<disk>/bcache/tree_depth
131Date: November 2010
132Contact: Kent Overstreet <kent.overstreet@gmail.com>
133Description:
134 For a cache, height of the btree excluding leaf nodes (i.e. a
135 one node tree will have a depth of 0).
136
137What: /sys/block/<disk>/bcache/btree_cache_size
138Date: November 2010
139Contact: Kent Overstreet <kent.overstreet@gmail.com>
140Description:
141 Number of btree buckets/nodes that are currently cached in
142 memory; cache dynamically grows and shrinks in response to
143 memory pressure from the rest of the system.
144
145What: /sys/block/<disk>/bcache/written
146Date: November 2010
147Contact: Kent Overstreet <kent.overstreet@gmail.com>
148Description:
149 For a cache, total amount of data in human readable units
150 written to the cache, excluding all metadata.
151
152What: /sys/block/<disk>/bcache/btree_written
153Date: November 2010
154Contact: Kent Overstreet <kent.overstreet@gmail.com>
155Description:
156 For a cache, sum of all btree writes in human readable units.
diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt
new file mode 100644
index 000000000000..77db8809bd96
--- /dev/null
+++ b/Documentation/bcache.txt
@@ -0,0 +1,431 @@
1Say you've got a big slow raid 6, and an X-25E or three. Wouldn't it be
2nice if you could use them as cache... Hence bcache.
3
4Wiki and git repositories are at:
5 http://bcache.evilpiepirate.org
6 http://evilpiepirate.org/git/linux-bcache.git
7 http://evilpiepirate.org/git/bcache-tools.git
8
9It's designed around the performance characteristics of SSDs - it only allocates
10in erase block sized buckets, and it uses a hybrid btree/log to track cached
11extents (which can be anywhere from a single sector to the bucket size). It's
12designed to avoid random writes at all costs; it fills up an erase block
13sequentially, then issues a discard before reusing it.
14
15Both writethrough and writeback caching are supported. Writeback defaults to
16off, but can be switched on and off arbitrarily at runtime. Bcache goes to
17great lengths to protect your data - it reliably handles unclean shutdown. (It
18doesn't even have a notion of a clean shutdown; bcache simply doesn't return
19writes as completed until they're on stable storage).
20
21Writeback caching can use most of the cache for buffering writes - writing
22dirty data to the backing device is always done sequentially, scanning from the
23start to the end of the index.
24
25Since random IO is what SSDs excel at, there generally won't be much benefit
26to caching large sequential IO. Bcache detects sequential IO and skips it;
27it also keeps a rolling average of the IO sizes per task, and as long as the
28average is above the cutoff it will skip all IO from that task - instead of
29caching the first 512k after every seek. Backups and large file copies should
30thus entirely bypass the cache.
31
32In the event of a data IO error on the flash it will try to recover by reading
33from disk or invalidating cache entries. For unrecoverable errors (meta data
34or dirty data), caching is automatically disabled; if dirty data was present
35in the cache it first disables writeback caching and waits for all dirty data
36to be flushed.
37
38Getting started:
39You'll need make-bcache from the bcache-tools repository. Both the cache device
40and backing device must be formatted before use.
41 make-bcache -B /dev/sdb
42 make-bcache -C /dev/sdc
43
44make-bcache has the ability to format multiple devices at the same time - if
45you format your backing devices and cache device at the same time, you won't
46have to manually attach:
47 make-bcache -B /dev/sda /dev/sdb -C /dev/sdc
48
49To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register:
50
51 echo /dev/sdb > /sys/fs/bcache/register
52 echo /dev/sdc > /sys/fs/bcache/register
53
54To register your bcache devices automatically, you could add something like
55this to an init script:
56
57 echo /dev/sd* > /sys/fs/bcache/register_quiet
58
59It'll look for bcache superblocks and ignore everything that doesn't have one.
60
61Registering the backing device makes the bcache show up in /dev; you can now
62format it and use it as normal. But the first time using a new bcache device,
63it'll be running in passthrough mode until you attach it to a cache. See the
64section on attaching.
65
66The devices show up at /dev/bcacheN, and can be controlled via sysfs from
67/sys/block/bcacheN/bcache:
68
69 mkfs.ext4 /dev/bcache0
70 mount /dev/bcache0 /mnt
71
72Cache devices are managed as sets; multiple caches per set isn't supported yet
73but will allow for mirroring of metadata and dirty data in the future. Your new
74cache set shows up as /sys/fs/bcache/<UUID>
75
76ATTACHING:
77
78After your cache device and backing device are registered, the backing device
79must be attached to your cache set to enable caching. Attaching a backing
80device to a cache set is done thusly, with the UUID of the cache set in
81/sys/fs/bcache:
82
83 echo <UUID> > /sys/block/bcache0/bcache/attach
84
85This only has to be done once. The next time you reboot, just reregister all
86your bcache devices. If a backing device has data in a cache somewhere, the
87/dev/bcache# device won't be created until the cache shows up - particularly
88important if you have writeback caching turned on.
89
90If you're booting up and your cache device is gone and never coming back, you
91can force run the backing device:
92
93 echo 1 > /sys/block/sdb/bcache/running
94
95(You need to use /sys/block/sdb (or whatever your backing device is called), not
96/sys/block/bcache0, because bcache0 doesn't exist yet. If you're using a
97partition, the bcache directory would be at /sys/block/sdb/sdb2/bcache)
98
99The backing device will still use that cache set if it shows up in the future,
100but all the cached data will be invalidated. If there was dirty data in the
101cache, don't expect the filesystem to be recoverable - you will have massive
102filesystem corruption, though ext4's fsck does work miracles.
103
104ERROR HANDLING:
105
106Bcache tries to transparently handle IO errors to/from the cache device without
107affecting normal operation; if it sees too many errors (the threshold is
108configurable, and defaults to 0) it shuts down the cache device and switches all
109the backing devices to passthrough mode.
110
111 - For reads from the cache, if they error we just retry the read from the
112 backing device.
113
114 - For writethrough writes, if the write to the cache errors we just switch to
115 invalidating the data at that lba in the cache (i.e. the same thing we do for
116 a write that bypasses the cache)
117
118 - For writeback writes, we currently pass that error back up to the
119 filesystem/userspace. This could be improved - we could retry it as a write
120 that skips the cache so we don't have to error the write.
121
122 - When we detach, we first try to flush any dirty data (if we were running in
123 writeback mode). It currently doesn't do anything intelligent if it fails to
124 read some of the dirty data, though.
125
126TROUBLESHOOTING PERFORMANCE:
127
128Bcache has a bunch of config options and tunables. The defaults are intended to
129be reasonable for typical desktop and server workloads, but they're not what you
130want for getting the best possible numbers when benchmarking.
131
132 - Bad write performance
133
134 If write performance is not what you expected, you probably wanted to be
135 running in writeback mode, which isn't the default (not due to a lack of
136 maturity, but simply because in writeback mode you'll lose data if something
137 happens to your SSD)
138
139 # echo writeback > /sys/block/bcache0/bcache/cache_mode
140
141 - Bad performance, or traffic not going to the SSD that you'd expect
142
143 By default, bcache doesn't cache everything. It tries to skip sequential IO -
144 because you really want to be caching the random IO, and if you copy a 10
145 gigabyte file you probably don't want that pushing 10 gigabytes of randomly
146 accessed data out of your cache.
147
148 But if you want to benchmark reads from cache, and you start out with fio
149 writing an 8 gigabyte test file - you'll want to disable that.
150
151 # echo 0 > /sys/block/bcache0/bcache/sequential_cutoff
152
153 To set it back to the default (4 mb), do
154
155 # echo 4M > /sys/block/bcache0/bcache/sequential_cutoff
156
157 - Traffic's still going to the spindle/still getting cache misses
158
159 In the real world, SSDs don't always keep up with disks - particularly with
160 slower SSDs, many disks being cached by one SSD, or mostly sequential IO. So
161 you want to avoid being bottlenecked by the SSD and having it slow everything
162 down.
163
164 To avoid that bcache tracks latency to the cache device, and gradually
165 throttles traffic if the latency exceeds a threshold (it does this by
166 cranking down the sequential bypass).
167
168 You can disable this if you need to by setting the thresholds to 0:
169
170 # echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us
171 # echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us
172
173 The default is 2000 us (2 milliseconds) for reads, and 20000 for writes.
174
175 - Still getting cache misses, of the same data
176
177 One last issue that sometimes trips people up is actually an old bug, due to
178 the way cache coherency is handled for cache misses. If a btree node is full,
179 a cache miss won't be able to insert a key for the new data and the data
180 won't be written to the cache.
181
182 In practice this isn't an issue because as soon as a write comes along it'll
183 cause the btree node to be split, and you need almost no write traffic for
184 this to not show up enough to be noticeable (especially since bcache's btree
185 nodes are huge and index large regions of the device). But when you're
186 benchmarking, if you're trying to warm the cache by reading a bunch of data
187 and there's no other traffic - that can be a problem.
188
189 Solution: warm the cache by doing writes, or use the testing branch (there's
190 a fix for the issue there).
191
192SYSFS - BACKING DEVICE:
193
194attach
195 Echo the UUID of a cache set to this file to enable caching.
196
197cache_mode
198 Can be one of either writethrough, writeback, writearound or none.
199
200clear_stats
201 Writing to this file resets the running total stats (not the day/hour/5 minute
202 decaying versions).
203
204detach
205 Write to this file to detach from a cache set. If there is dirty data in the
206 cache, it will be flushed first.
207
208dirty_data
209 Amount of dirty data for this backing device in the cache. Continuously
210 updated unlike the cache set's version, but may be slightly off.
211
212label
213 Name of underlying device.
214
215readahead
216 Size of readahead that should be performed. Defaults to 0. If set to e.g.
217 1M, it will round cache miss reads up to that size, but without overlapping
218 existing cache entries.
219
220running
221 1 if bcache is running (i.e. whether the /dev/bcache device exists, whether
222 it's in passthrough mode or caching).
223
224sequential_cutoff
225 A sequential IO will bypass the cache once it passes this threshold; the
226 most recent 128 IOs are tracked so sequential IO can be detected even when
227 it isn't all done at once.
228
229sequential_merge
230 If non zero, bcache keeps a list of the last 128 requests submitted to compare
231 against all new requests to determine which new requests are sequential
232 continuations of previous requests for the purpose of determining sequential
233 cutoff. This is necessary if the sequential cutoff value is greater than the
234 maximum acceptable sequential size for any single request.
235
236state
237 The backing device can be in one of four different states:
238
239 no cache: Has never been attached to a cache set.
240
241 clean: Part of a cache set, and there is no cached dirty data.
242
243 dirty: Part of a cache set, and there is cached dirty data.
244
245 inconsistent: The backing device was forcibly run by the user when there was
246 dirty data cached but the cache set was unavailable; whatever data was on the
247 backing device has likely been corrupted.
248
249stop
250 Write to this file to shut down the bcache device and close the backing
251 device.
252
253writeback_delay
254 When dirty data is written to the cache and it previously did not contain
255 any, waits some number of seconds before initiating writeback. Defaults to
256 30.
257
258writeback_percent
259 If nonzero, bcache tries to keep around this percentage of the cache dirty by
260 throttling background writeback and using a PD controller to smoothly adjust
261 the rate.
262
263writeback_rate
264 Rate in sectors per second - if writeback_percent is nonzero, background
265 writeback is throttled to this rate. Continuously adjusted by bcache but may
266 also be set by the user.
267
268writeback_running
269 If off, writeback of dirty data will not take place at all. Dirty data will
270 still be added to the cache until it is mostly full; only meant for
271 benchmarking. Defaults to on.
272
273SYSFS - BACKING DEVICE STATS:
274
275There are directories with these numbers for a running total, as well as
276versions that decay over the past day, hour and 5 minutes; they're also
277aggregated in the cache set directory as well.
278
279bypassed
280 Amount of IO (both reads and writes) that has bypassed the cache
281
282cache_hits
283cache_misses
284cache_hit_ratio
285 Hits and misses are counted per individual IO as bcache sees them; a
286 partial hit is counted as a miss.
287
288cache_bypass_hits
289cache_bypass_misses
290 Hits and misses for IO that is intended to skip the cache are still counted,
291 but broken out here.
292
293cache_miss_collisions
294 Counts instances where data was going to be inserted into the cache from a
295 cache miss, but raced with a write and data was already present (usually 0
296 since the synchronization for cache misses was rewritten)
297
298cache_readaheads
299 Count of times readahead occurred.
300
301SYSFS - CACHE SET:
302
303average_key_size
304 Average data per key in the btree.
305
306bdev<0..n>
307 Symlink to each of the attached backing devices.
308
309block_size
310 Block size of the cache devices.
311
312btree_cache_size
313 Amount of memory currently used by the btree cache
314
315bucket_size
316 Size of buckets
317
318cache<0..n>
319 Symlink to each of the cache devices comprising this cache set.
320
321cache_available_percent
322 Percentage of cache device free.
323
324clear_stats
325 Clears the statistics associated with this cache
326
327dirty_data
328 Amount of dirty data in the cache (updated when garbage collection runs).
329
330flash_vol_create
331 Echoing a size to this file (in human readable units, k/M/G) creates a thinly
332 provisioned volume backed by the cache set.
333
334io_error_halflife
335io_error_limit
336 These determine how many errors we accept before disabling the cache.
337 Each error is decayed by the half life (in # ios). If the decaying count
338 reaches io_error_limit dirty data is written out and the cache is disabled.
339
340journal_delay_ms
341 Journal writes will delay for up to this many milliseconds, unless a cache
342 flush happens sooner. Defaults to 100.
343
344root_usage_percent
345 Percentage of the root btree node in use. If this gets too high the node
346 will split, increasing the tree depth.
347
348stop
349 Write to this file to shut down the cache set - waits until all attached
350 backing devices have been shut down.
351
352tree_depth
353 Depth of the btree (A single node btree has depth 0).
354
355unregister
356 Detaches all backing devices and closes the cache devices; if dirty data is
357 present it will disable writeback caching and wait for it to be flushed.
358
359SYSFS - CACHE SET INTERNAL:
360
361This directory also exposes timings for a number of internal operations, with
362separate files for average duration, average frequency, last occurrence and max
363duration: garbage collection, btree read, btree node sorts and btree splits.
364
365active_journal_entries
366 Number of journal entries that are newer than the index.
367
368btree_nodes
369 Total nodes in the btree.
370
371btree_used_percent
372 Average fraction of btree in use.
373
374bset_tree_stats
375 Statistics about the auxiliary search trees
376
377btree_cache_max_chain
378 Longest chain in the btree node cache's hash table
379
380cache_read_races
381 Counts instances where while data was being read from the cache, the bucket
382 was reused and invalidated - i.e. where the pointer was stale after the read
383 completed. When this occurs the data is reread from the backing device.
384
385trigger_gc
386 Writing to this file forces garbage collection to run.
387
388SYSFS - CACHE DEVICE:
389
390block_size
391 Minimum granularity of writes - should match hardware sector size.
392
393btree_written
394 Sum of all btree writes, in (kilo/mega/giga) bytes
395
396bucket_size
397 Size of buckets
398
399cache_replacement_policy
400 One of either lru, fifo or random.
401
402discard
403 Boolean; if on a discard/TRIM will be issued to each bucket before it is
404 reused. Defaults to off, since SATA TRIM is an unqueued command (and thus
405 slow).
406
407freelist_percent
408 Size of the freelist as a percentage of nbuckets. Can be written to to
409 increase the number of buckets kept on the freelist, which lets you
410 artificially reduce the size of the cache at runtime. Mostly for testing
411 purposes (i.e. testing how different size caches affect your hit rate), but
412 since buckets are discarded when they move on to the freelist will also make
413 the SSD's garbage collection easier by effectively giving it more reserved
414 space.
415
416io_errors
417 Number of errors that have occurred, decayed by io_error_halflife.
418
419metadata_written
420 Sum of all non data writes (btree writes and all other metadata).
421
422nbuckets
423 Total buckets in this cache
424
425priority_stats
426 Statistics about how recently data in the cache has been accessed. This can
427 reveal your working set size.
428
429written
430 Sum of all data that has been written to the cache; comparison with
431 btree_written gives the amount of write inflation in bcache.
diff --git a/MAINTAINERS b/MAINTAINERS
index e73c374483cb..5f5c895e6621 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1620,6 +1620,13 @@ W: http://www.baycom.org/~tom/ham/ham.html
1620S: Maintained 1620S: Maintained
1621F: drivers/net/hamradio/baycom* 1621F: drivers/net/hamradio/baycom*
1622 1622
1623BCACHE (BLOCK LAYER CACHE)
1624M: Kent Overstreet <koverstreet@google.com>
1625L: linux-bcache@vger.kernel.org
1626W: http://bcache.evilpiepirate.org
1627S: Maintained
1628F: drivers/md/bcache/
1629
1623BEFS FILE SYSTEM 1630BEFS FILE SYSTEM
1624S: Orphan 1631S: Orphan
1625F: Documentation/filesystems/befs.txt 1632F: Documentation/filesystems/befs.txt
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 5efed089a702..fc803ecbbce4 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -920,16 +920,14 @@ bio_pagedec(struct bio *bio)
920static void 920static void
921bufinit(struct buf *buf, struct request *rq, struct bio *bio) 921bufinit(struct buf *buf, struct request *rq, struct bio *bio)
922{ 922{
923 struct bio_vec *bv;
924
925 memset(buf, 0, sizeof(*buf)); 923 memset(buf, 0, sizeof(*buf));
926 buf->rq = rq; 924 buf->rq = rq;
927 buf->bio = bio; 925 buf->bio = bio;
928 buf->resid = bio->bi_size; 926 buf->resid = bio->bi_size;
929 buf->sector = bio->bi_sector; 927 buf->sector = bio->bi_sector;
930 bio_pageinc(bio); 928 bio_pageinc(bio);
931 buf->bv = bv = bio_iovec(bio); 929 buf->bv = bio_iovec(bio);
932 buf->bv_resid = bv->bv_len; 930 buf->bv_resid = buf->bv->bv_len;
933 WARN_ON(buf->bv_resid == 0); 931 WARN_ON(buf->bv_resid == 0);
934} 932}
935 933
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 94b51c5e0678..6374dc103521 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -75,6 +75,12 @@ module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR);
75MODULE_PARM_DESC(cciss_simple_mode, 75MODULE_PARM_DESC(cciss_simple_mode,
76 "Use 'simple mode' rather than 'performant mode'"); 76 "Use 'simple mode' rather than 'performant mode'");
77 77
78static int cciss_allow_hpsa;
79module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR);
80MODULE_PARM_DESC(cciss_allow_hpsa,
81 "Prevent cciss driver from accessing hardware known to be "
82 " supported by the hpsa driver");
83
78static DEFINE_MUTEX(cciss_mutex); 84static DEFINE_MUTEX(cciss_mutex);
79static struct proc_dir_entry *proc_cciss; 85static struct proc_dir_entry *proc_cciss;
80 86
@@ -4115,9 +4121,13 @@ static int cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id)
4115 *board_id = ((subsystem_device_id << 16) & 0xffff0000) | 4121 *board_id = ((subsystem_device_id << 16) & 0xffff0000) |
4116 subsystem_vendor_id; 4122 subsystem_vendor_id;
4117 4123
4118 for (i = 0; i < ARRAY_SIZE(products); i++) 4124 for (i = 0; i < ARRAY_SIZE(products); i++) {
4125 /* Stand aside for hpsa driver on request */
4126 if (cciss_allow_hpsa)
4127 return -ENODEV;
4119 if (*board_id == products[i].board_id) 4128 if (*board_id == products[i].board_id)
4120 return i; 4129 return i;
4130 }
4121 dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n", 4131 dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n",
4122 *board_id); 4132 *board_id);
4123 return -ENODEV; 4133 return -ENODEV;
@@ -4959,6 +4969,16 @@ static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
4959 ctlr_info_t *h; 4969 ctlr_info_t *h;
4960 unsigned long flags; 4970 unsigned long flags;
4961 4971
4972 /*
4973 * By default the cciss driver is used for all older HP Smart Array
4974 * controllers. There are module parameters that allow a user to
4975 * override this behavior and instead use the hpsa SCSI driver. If
4976 * this is the case cciss may be loaded first from the kdump initrd
4977 * image and cause a kernel panic. So if reset_devices is true and
4978 * cciss_allow_hpsa is set just bail.
4979 */
4980 if ((reset_devices) && (cciss_allow_hpsa == 1))
4981 return -ENODEV;
4962 rc = cciss_init_reset_devices(pdev); 4982 rc = cciss_init_reset_devices(pdev);
4963 if (rc) { 4983 if (rc) {
4964 if (rc != -ENOTSUPP) 4984 if (rc != -ENOTSUPP)
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 92510f8ad013..6608076dc39e 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -104,7 +104,6 @@ struct update_al_work {
104 int err; 104 int err;
105}; 105};
106 106
107static int al_write_transaction(struct drbd_conf *mdev);
108 107
109void *drbd_md_get_buffer(struct drbd_conf *mdev) 108void *drbd_md_get_buffer(struct drbd_conf *mdev)
110{ 109{
@@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
168 bio->bi_end_io = drbd_md_io_complete; 167 bio->bi_end_io = drbd_md_io_complete;
169 bio->bi_rw = rw; 168 bio->bi_rw = rw;
170 169
171 if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ 170 if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL)
171 /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
172 ;
173 else if (!get_ldev_if_state(mdev, D_ATTACHING)) {
174 /* Corresponding put_ldev in drbd_md_io_complete() */
172 dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); 175 dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
173 err = -ENODEV; 176 err = -ENODEV;
174 goto out; 177 goto out;
@@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
199 202
200 BUG_ON(!bdev->md_bdev); 203 BUG_ON(!bdev->md_bdev);
201 204
202 dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", 205 dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
203 current->comm, current->pid, __func__, 206 current->comm, current->pid, __func__,
204 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); 207 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
208 (void*)_RET_IP_ );
205 209
206 if (sector < drbd_md_first_sector(bdev) || 210 if (sector < drbd_md_first_sector(bdev) ||
207 sector + 7 > drbd_md_last_sector(bdev)) 211 sector + 7 > drbd_md_last_sector(bdev))
@@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
209 current->comm, current->pid, __func__, 213 current->comm, current->pid, __func__,
210 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); 214 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
211 215
212 err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); 216 /* we do all our meta data IO in aligned 4k blocks. */
217 err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
213 if (err) { 218 if (err) {
214 dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", 219 dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
215 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); 220 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
217 return err; 222 return err;
218} 223}
219 224
220static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) 225static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr)
221{ 226{
222 struct lc_element *al_ext;
223 struct lc_element *tmp; 227 struct lc_element *tmp;
224 int wake;
225
226 spin_lock_irq(&mdev->al_lock);
227 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); 228 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
228 if (unlikely(tmp != NULL)) { 229 if (unlikely(tmp != NULL)) {
229 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 230 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
230 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { 231 if (test_bit(BME_NO_WRITES, &bm_ext->flags))
231 wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); 232 return bm_ext;
232 spin_unlock_irq(&mdev->al_lock); 233 }
233 if (wake) 234 return NULL;
234 wake_up(&mdev->al_wait); 235}
235 return NULL; 236
236 } 237static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock)
238{
239 struct lc_element *al_ext;
240 struct bm_extent *bm_ext;
241 int wake;
242
243 spin_lock_irq(&mdev->al_lock);
244 bm_ext = find_active_resync_extent(mdev, enr);
245 if (bm_ext) {
246 wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
247 spin_unlock_irq(&mdev->al_lock);
248 if (wake)
249 wake_up(&mdev->al_wait);
250 return NULL;
237 } 251 }
238 al_ext = lc_get(mdev->act_log, enr); 252 if (nonblock)
253 al_ext = lc_try_get(mdev->act_log, enr);
254 else
255 al_ext = lc_get(mdev->act_log, enr);
239 spin_unlock_irq(&mdev->al_lock); 256 spin_unlock_irq(&mdev->al_lock);
240 return al_ext; 257 return al_ext;
241} 258}
242 259
243void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) 260bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i)
244{ 261{
245 /* for bios crossing activity log extent boundaries, 262 /* for bios crossing activity log extent boundaries,
246 * we may need to activate two extents in one go */ 263 * we may need to activate two extents in one go */
247 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 264 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
248 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 265 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
249 unsigned enr;
250 bool locked = false;
251 266
267 D_ASSERT((unsigned)(last - first) <= 1);
268 D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
269
270 /* FIXME figure out a fast path for bios crossing AL extent boundaries */
271 if (first != last)
272 return false;
273
274 return _al_get(mdev, first, true);
275}
276
277bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i)
278{
279 /* for bios crossing activity log extent boundaries,
280 * we may need to activate two extents in one go */
281 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
282 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
283 unsigned enr;
284 bool need_transaction = false;
252 285
253 D_ASSERT(first <= last); 286 D_ASSERT(first <= last);
254 D_ASSERT(atomic_read(&mdev->local_cnt) > 0); 287 D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
255 288
256 for (enr = first; enr <= last; enr++) 289 for (enr = first; enr <= last; enr++) {
257 wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); 290 struct lc_element *al_ext;
291 wait_event(mdev->al_wait,
292 (al_ext = _al_get(mdev, enr, false)) != NULL);
293 if (al_ext->lc_number != enr)
294 need_transaction = true;
295 }
296 return need_transaction;
297}
298
299static int al_write_transaction(struct drbd_conf *mdev, bool delegate);
300
301/* When called through generic_make_request(), we must delegate
302 * activity log I/O to the worker thread: a further request
303 * submitted via generic_make_request() within the same task
304 * would be queued on current->bio_list, and would only start
305 * after this function returns (see generic_make_request()).
306 *
307 * However, if we *are* the worker, we must not delegate to ourselves.
308 */
309
310/*
311 * @delegate: delegate activity log I/O to the worker thread
312 */
313void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate)
314{
315 bool locked = false;
316
317 BUG_ON(delegate && current == mdev->tconn->worker.task);
258 318
259 /* Serialize multiple transactions. 319 /* Serialize multiple transactions.
260 * This uses test_and_set_bit, memory barrier is implicit. 320 * This uses test_and_set_bit, memory barrier is implicit.
@@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
264 (locked = lc_try_lock_for_transaction(mdev->act_log))); 324 (locked = lc_try_lock_for_transaction(mdev->act_log)));
265 325
266 if (locked) { 326 if (locked) {
267 /* drbd_al_write_transaction(mdev,al_ext,enr);
268 * recurses into generic_make_request(), which
269 * disallows recursion, bios being serialized on the
270 * current->bio_tail list now.
271 * we have to delegate updates to the activity log
272 * to the worker thread. */
273
274 /* Double check: it may have been committed by someone else, 327 /* Double check: it may have been committed by someone else,
275 * while we have been waiting for the lock. */ 328 * while we have been waiting for the lock. */
276 if (mdev->act_log->pending_changes) { 329 if (mdev->act_log->pending_changes) {
@@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
280 write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; 333 write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
281 rcu_read_unlock(); 334 rcu_read_unlock();
282 335
283 if (write_al_updates) { 336 if (write_al_updates)
284 al_write_transaction(mdev); 337 al_write_transaction(mdev, delegate);
285 mdev->al_writ_cnt++;
286 }
287
288 spin_lock_irq(&mdev->al_lock); 338 spin_lock_irq(&mdev->al_lock);
289 /* FIXME 339 /* FIXME
290 if (err) 340 if (err)
@@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
298 } 348 }
299} 349}
300 350
351/*
352 * @delegate: delegate activity log I/O to the worker thread
353 */
354void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate)
355{
356 BUG_ON(delegate && current == mdev->tconn->worker.task);
357
358 if (drbd_al_begin_io_prepare(mdev, i))
359 drbd_al_begin_io_commit(mdev, delegate);
360}
361
362int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i)
363{
364 struct lru_cache *al = mdev->act_log;
365 /* for bios crossing activity log extent boundaries,
366 * we may need to activate two extents in one go */
367 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
368 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
369 unsigned nr_al_extents;
370 unsigned available_update_slots;
371 unsigned enr;
372
373 D_ASSERT(first <= last);
374
375 nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */
376 available_update_slots = min(al->nr_elements - al->used,
377 al->max_pending_changes - al->pending_changes);
378
379 /* We want all necessary updates for a given request within the same transaction
380 * We could first check how many updates are *actually* needed,
381 * and use that instead of the worst-case nr_al_extents */
382 if (available_update_slots < nr_al_extents)
383 return -EWOULDBLOCK;
384
385 /* Is resync active in this area? */
386 for (enr = first; enr <= last; enr++) {
387 struct lc_element *tmp;
388 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
389 if (unlikely(tmp != NULL)) {
390 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
391 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
392 if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
393 return -EBUSY;
394 return -EWOULDBLOCK;
395 }
396 }
397 }
398
399 /* Checkout the refcounts.
400 * Given that we checked for available elements and update slots above,
401 * this has to be successful. */
402 for (enr = first; enr <= last; enr++) {
403 struct lc_element *al_ext;
404 al_ext = lc_get_cumulative(mdev->act_log, enr);
405 if (!al_ext)
406 dev_info(DEV, "LOGIC BUG for enr=%u\n", enr);
407 }
408 return 0;
409}
410
301void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) 411void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
302{ 412{
303 /* for bios crossing activity log extent boundaries, 413 /* for bios crossing activity log extent boundaries,
@@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
350 (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); 460 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
351} 461}
352 462
463static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
464{
465 const unsigned int stripes = mdev->ldev->md.al_stripes;
466 const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k;
467
468 /* transaction number, modulo on-disk ring buffer wrap around */
469 unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k);
470
471 /* ... to aligned 4k on disk block */
472 t = ((t % stripes) * stripe_size_4kB) + t/stripes;
473
474 /* ... to 512 byte sector in activity log */
475 t *= 8;
476
477 /* ... plus offset to the on disk position */
478 return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
479}
480
353static int 481static int
354_al_write_transaction(struct drbd_conf *mdev) 482_al_write_transaction(struct drbd_conf *mdev)
355{ 483{
@@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev)
432 if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) 560 if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
433 mdev->al_tr_cycle = 0; 561 mdev->al_tr_cycle = 0;
434 562
435 sector = mdev->ldev->md.md_offset 563 sector = al_tr_number_to_on_disk_sector(mdev);
436 + mdev->ldev->md.al_offset
437 + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
438 564
439 crc = crc32c(0, buffer, 4096); 565 crc = crc32c(0, buffer, 4096);
440 buffer->crc32c = cpu_to_be32(crc); 566 buffer->crc32c = cpu_to_be32(crc);
441 567
442 if (drbd_bm_write_hinted(mdev)) 568 if (drbd_bm_write_hinted(mdev))
443 err = -EIO; 569 err = -EIO;
444 /* drbd_chk_io_error done already */ 570 else {
445 else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { 571 bool write_al_updates;
446 err = -EIO; 572 rcu_read_lock();
447 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); 573 write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
448 } else { 574 rcu_read_unlock();
449 /* advance ringbuffer position and transaction counter */ 575 if (write_al_updates) {
450 mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); 576 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
451 mdev->al_tr_number++; 577 err = -EIO;
578 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
579 } else {
580 mdev->al_tr_number++;
581 mdev->al_writ_cnt++;
582 }
583 }
452 } 584 }
453 585
454 drbd_md_put_buffer(mdev); 586 drbd_md_put_buffer(mdev);
@@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused)
474/* Calls from worker context (see w_restart_disk_io()) need to write the 606/* Calls from worker context (see w_restart_disk_io()) need to write the
475 transaction directly. Others came through generic_make_request(), 607 transaction directly. Others came through generic_make_request(),
476 those need to delegate it to the worker. */ 608 those need to delegate it to the worker. */
477static int al_write_transaction(struct drbd_conf *mdev) 609static int al_write_transaction(struct drbd_conf *mdev, bool delegate)
478{ 610{
479 struct update_al_work al_work; 611 if (delegate) {
480 612 struct update_al_work al_work;
481 if (current == mdev->tconn->worker.task) 613 init_completion(&al_work.event);
614 al_work.w.cb = w_al_write_transaction;
615 al_work.w.mdev = mdev;
616 drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
617 wait_for_completion(&al_work.event);
618 return al_work.err;
619 } else
482 return _al_write_transaction(mdev); 620 return _al_write_transaction(mdev);
483
484 init_completion(&al_work.event);
485 al_work.w.cb = w_al_write_transaction;
486 al_work.w.mdev = mdev;
487 drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
488 wait_for_completion(&al_work.event);
489
490 return al_work.err;
491} 621}
492 622
493static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) 623static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8dc29502dc08..64fbb8385cdc 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
612 } 612 }
613} 613}
614 614
615/* For the layout, see comment above drbd_md_set_sector_offsets(). */
616static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
617{
618 u64 bitmap_sectors;
619 if (ldev->md.al_offset == 8)
620 bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
621 else
622 bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
623 return bitmap_sectors << (9 + 3);
624}
625
615/* 626/*
616 * make sure the bitmap has enough room for the attached storage, 627 * make sure the bitmap has enough room for the attached storage,
617 * if necessary, resize. 628 * if necessary, resize.
@@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
668 words = ALIGN(bits, 64) >> LN2_BPL; 679 words = ALIGN(bits, 64) >> LN2_BPL;
669 680
670 if (get_ldev(mdev)) { 681 if (get_ldev(mdev)) {
671 u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12; 682 u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev);
672 put_ldev(mdev); 683 put_ldev(mdev);
673 if (bits > bits_on_disk) { 684 if (bits > bits_on_disk) {
674 dev_info(DEV, "bits = %lu\n", bits); 685 dev_info(DEV, "bits = %lu\n", bits);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 6b51afa1aae1..f943aacfdad8 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -753,13 +753,16 @@ struct drbd_md {
753 u32 flags; 753 u32 flags;
754 u32 md_size_sect; 754 u32 md_size_sect;
755 755
756 s32 al_offset; /* signed relative sector offset to al area */ 756 s32 al_offset; /* signed relative sector offset to activity log */
757 s32 bm_offset; /* signed relative sector offset to bitmap */ 757 s32 bm_offset; /* signed relative sector offset to bitmap */
758 758
759 /* u32 al_nr_extents; important for restoring the AL 759 /* cached value of bdev->disk_conf->meta_dev_idx (see below) */
760 * is stored into ldev->dc.al_extents, which in turn 760 s32 meta_dev_idx;
761 * gets applied to act_log->nr_elements 761
762 */ 762 /* see al_tr_number_to_on_disk_sector() */
763 u32 al_stripes;
764 u32 al_stripe_size_4k;
765 u32 al_size_4k; /* cached product of the above */
763}; 766};
764 767
765struct drbd_backing_dev { 768struct drbd_backing_dev {
@@ -891,6 +894,14 @@ struct drbd_tconn { /* is a resource from the config file */
891 } send; 894 } send;
892}; 895};
893 896
897struct submit_worker {
898 struct workqueue_struct *wq;
899 struct work_struct worker;
900
901 spinlock_t lock;
902 struct list_head writes;
903};
904
894struct drbd_conf { 905struct drbd_conf {
895 struct drbd_tconn *tconn; 906 struct drbd_tconn *tconn;
896 int vnr; /* volume number within the connection */ 907 int vnr; /* volume number within the connection */
@@ -1009,7 +1020,6 @@ struct drbd_conf {
1009 struct lru_cache *act_log; /* activity log */ 1020 struct lru_cache *act_log; /* activity log */
1010 unsigned int al_tr_number; 1021 unsigned int al_tr_number;
1011 int al_tr_cycle; 1022 int al_tr_cycle;
1012 int al_tr_pos; /* position of the next transaction in the journal */
1013 wait_queue_head_t seq_wait; 1023 wait_queue_head_t seq_wait;
1014 atomic_t packet_seq; 1024 atomic_t packet_seq;
1015 unsigned int peer_seq; 1025 unsigned int peer_seq;
@@ -1032,6 +1042,10 @@ struct drbd_conf {
1032 atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ 1042 atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
1033 unsigned int peer_max_bio_size; 1043 unsigned int peer_max_bio_size;
1034 unsigned int local_max_bio_size; 1044 unsigned int local_max_bio_size;
1045
1046 /* any requests that would block in drbd_make_request()
1047 * are deferred to this single-threaded work queue */
1048 struct submit_worker submit;
1035}; 1049};
1036 1050
1037static inline struct drbd_conf *minor_to_mdev(unsigned int minor) 1051static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1148,25 +1162,44 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1148 char *why, enum bm_flag flags); 1162 char *why, enum bm_flag flags);
1149extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); 1163extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
1150extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); 1164extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
1151extern void drbd_go_diskless(struct drbd_conf *mdev);
1152extern void drbd_ldev_destroy(struct drbd_conf *mdev); 1165extern void drbd_ldev_destroy(struct drbd_conf *mdev);
1153 1166
1154/* Meta data layout 1167/* Meta data layout
1155 We reserve a 128MB Block (4k aligned) 1168 *
1156 * either at the end of the backing device 1169 * We currently have two possible layouts.
1157 * or on a separate meta data device. */ 1170 * Offsets in (512 byte) sectors.
1171 * external:
1172 * |----------- md_size_sect ------------------|
1173 * [ 4k superblock ][ activity log ][ Bitmap ]
1174 * | al_offset == 8 |
1175 * | bm_offset = al_offset + X |
1176 * ==> bitmap sectors = md_size_sect - bm_offset
1177 *
1178 * Variants:
1179 * old, indexed fixed size meta data:
1180 *
1181 * internal:
1182 * |----------- md_size_sect ------------------|
1183 * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*]
1184 * | al_offset < 0 |
1185 * | bm_offset = al_offset - Y |
1186 * ==> bitmap sectors = Y = al_offset - bm_offset
1187 *
1188 * [padding*] are zero or up to 7 unused 512 Byte sectors to the
1189 * end of the device, so that the [4k superblock] will be 4k aligned.
1190 *
1191 * The activity log consists of 4k transaction blocks,
1192 * which are written in a ring-buffer, or striped ring-buffer like fashion.
1193 * Its size used to be fixed 32kB,
1194 * but is about to become configurable.
1195 */
1158 1196
1159/* The following numbers are sectors */ 1197/* Our old fixed size meta data layout
1160/* Allows up to about 3.8TB, so if you want more, 1198 * allows up to about 3.8TB, so if you want more,
1161 * you need to use the "flexible" meta data format. */ 1199 * you need to use the "flexible" meta data format. */
1162#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ 1200#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */
1163#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ 1201#define MD_4kB_SECT 8
1164#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ 1202#define MD_32kB_SECT 64
1165#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
1166
1167/* we do all meta data IO in 4k blocks */
1168#define MD_BLOCK_SHIFT 12
1169#define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
1170 1203
1171/* One activity log extent represents 4M of storage */ 1204/* One activity log extent represents 4M of storage */
1172#define AL_EXTENT_SHIFT 22 1205#define AL_EXTENT_SHIFT 22
@@ -1256,7 +1289,6 @@ struct bm_extent {
1256 1289
1257/* in one sector of the bitmap, we have this many activity_log extents. */ 1290/* in one sector of the bitmap, we have this many activity_log extents. */
1258#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) 1291#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
1259#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
1260 1292
1261#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) 1293#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
1262#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) 1294#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
@@ -1276,16 +1308,18 @@ struct bm_extent {
1276 */ 1308 */
1277 1309
1278#define DRBD_MAX_SECTORS_32 (0xffffffffLU) 1310#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
1279#define DRBD_MAX_SECTORS_BM \ 1311/* we have a certain meta data variant that has a fixed on-disk size of 128
1280 ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9))) 1312 * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
1281#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 1313 * log, leaving this many sectors for the bitmap.
1282#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM 1314 */
1283#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM 1315
1284#elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 1316#define DRBD_MAX_SECTORS_FIXED_BM \
1317 ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
1318#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
1285#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 1319#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
1286#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 1320#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
1287#else 1321#else
1288#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM 1322#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
1289/* 16 TB in units of sectors */ 1323/* 16 TB in units of sectors */
1290#if BITS_PER_LONG == 32 1324#if BITS_PER_LONG == 32
1291/* adjust by one page worth of bitmap, 1325/* adjust by one page worth of bitmap,
@@ -1418,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn);
1418extern int proc_details; 1452extern int proc_details;
1419 1453
1420/* drbd_req */ 1454/* drbd_req */
1455extern void do_submit(struct work_struct *ws);
1421extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); 1456extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
1422extern void drbd_make_request(struct request_queue *q, struct bio *bio); 1457extern void drbd_make_request(struct request_queue *q, struct bio *bio);
1423extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); 1458extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
@@ -1576,7 +1611,10 @@ extern const char *drbd_conn_str(enum drbd_conns s);
1576extern const char *drbd_role_str(enum drbd_role s); 1611extern const char *drbd_role_str(enum drbd_role s);
1577 1612
1578/* drbd_actlog.c */ 1613/* drbd_actlog.c */
1579extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); 1614extern int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i);
1615extern void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate);
1616extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i);
1617extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate);
1580extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); 1618extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i);
1581extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); 1619extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
1582extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); 1620extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
@@ -1755,9 +1793,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
1755 * BTW, for internal meta data, this happens to be the maximum capacity 1793 * BTW, for internal meta data, this happens to be the maximum capacity
1756 * we could agree upon with our peer node. 1794 * we could agree upon with our peer node.
1757 */ 1795 */
1758static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) 1796static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
1759{ 1797{
1760 switch (meta_dev_idx) { 1798 switch (bdev->md.meta_dev_idx) {
1761 case DRBD_MD_INDEX_INTERNAL: 1799 case DRBD_MD_INDEX_INTERNAL:
1762 case DRBD_MD_INDEX_FLEX_INT: 1800 case DRBD_MD_INDEX_FLEX_INT:
1763 return bdev->md.md_offset + bdev->md.bm_offset; 1801 return bdev->md.md_offset + bdev->md.bm_offset;
@@ -1767,36 +1805,19 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi
1767 } 1805 }
1768} 1806}
1769 1807
1770static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
1771{
1772 int meta_dev_idx;
1773
1774 rcu_read_lock();
1775 meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
1776 rcu_read_unlock();
1777
1778 return _drbd_md_first_sector(meta_dev_idx, bdev);
1779}
1780
1781/** 1808/**
1782 * drbd_md_last_sector() - Return the last sector number of the meta data area 1809 * drbd_md_last_sector() - Return the last sector number of the meta data area
1783 * @bdev: Meta data block device. 1810 * @bdev: Meta data block device.
1784 */ 1811 */
1785static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) 1812static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
1786{ 1813{
1787 int meta_dev_idx; 1814 switch (bdev->md.meta_dev_idx) {
1788
1789 rcu_read_lock();
1790 meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
1791 rcu_read_unlock();
1792
1793 switch (meta_dev_idx) {
1794 case DRBD_MD_INDEX_INTERNAL: 1815 case DRBD_MD_INDEX_INTERNAL:
1795 case DRBD_MD_INDEX_FLEX_INT: 1816 case DRBD_MD_INDEX_FLEX_INT:
1796 return bdev->md.md_offset + MD_AL_OFFSET - 1; 1817 return bdev->md.md_offset + MD_4kB_SECT -1;
1797 case DRBD_MD_INDEX_FLEX_EXT: 1818 case DRBD_MD_INDEX_FLEX_EXT:
1798 default: 1819 default:
1799 return bdev->md.md_offset + bdev->md.md_size_sect; 1820 return bdev->md.md_offset + bdev->md.md_size_sect -1;
1800 } 1821 }
1801} 1822}
1802 1823
@@ -1818,18 +1839,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev)
1818static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) 1839static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
1819{ 1840{
1820 sector_t s; 1841 sector_t s;
1821 int meta_dev_idx;
1822 1842
1823 rcu_read_lock(); 1843 switch (bdev->md.meta_dev_idx) {
1824 meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
1825 rcu_read_unlock();
1826
1827 switch (meta_dev_idx) {
1828 case DRBD_MD_INDEX_INTERNAL: 1844 case DRBD_MD_INDEX_INTERNAL:
1829 case DRBD_MD_INDEX_FLEX_INT: 1845 case DRBD_MD_INDEX_FLEX_INT:
1830 s = drbd_get_capacity(bdev->backing_bdev) 1846 s = drbd_get_capacity(bdev->backing_bdev)
1831 ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, 1847 ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
1832 _drbd_md_first_sector(meta_dev_idx, bdev)) 1848 drbd_md_first_sector(bdev))
1833 : 0; 1849 : 0;
1834 break; 1850 break;
1835 case DRBD_MD_INDEX_FLEX_EXT: 1851 case DRBD_MD_INDEX_FLEX_EXT:
@@ -1848,39 +1864,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
1848} 1864}
1849 1865
1850/** 1866/**
1851 * drbd_md_ss__() - Return the sector number of our meta data super block 1867 * drbd_md_ss() - Return the sector number of our meta data super block
1852 * @mdev: DRBD device.
1853 * @bdev: Meta data block device. 1868 * @bdev: Meta data block device.
1854 */ 1869 */
1855static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, 1870static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
1856 struct drbd_backing_dev *bdev)
1857{ 1871{
1858 int meta_dev_idx; 1872 const int meta_dev_idx = bdev->md.meta_dev_idx;
1859 1873
1860 rcu_read_lock(); 1874 if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT)
1861 meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
1862 rcu_read_unlock();
1863
1864 switch (meta_dev_idx) {
1865 default: /* external, some index */
1866 return MD_RESERVED_SECT * meta_dev_idx;
1867 case DRBD_MD_INDEX_INTERNAL:
1868 /* with drbd08, internal meta data is always "flexible" */
1869 case DRBD_MD_INDEX_FLEX_INT:
1870 /* sizeof(struct md_on_disk_07) == 4k
1871 * position: last 4k aligned block of 4k size */
1872 if (!bdev->backing_bdev) {
1873 if (__ratelimit(&drbd_ratelimit_state)) {
1874 dev_err(DEV, "bdev->backing_bdev==NULL\n");
1875 dump_stack();
1876 }
1877 return 0;
1878 }
1879 return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
1880 - MD_AL_OFFSET;
1881 case DRBD_MD_INDEX_FLEX_EXT:
1882 return 0; 1875 return 0;
1883 } 1876
1877 /* Since drbd08, internal meta data is always "flexible".
1878 * position: last 4k aligned block of 4k size */
1879 if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
1880 meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
1881 return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
1882
1883 /* external, some index; this is the old fixed size layout */
1884 return MD_128MB_SECT * bdev->md.meta_dev_idx;
1884} 1885}
1885 1886
1886static inline void 1887static inline void
@@ -2053,9 +2054,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
2053 if (mdev->state.disk == D_DISKLESS) 2054 if (mdev->state.disk == D_DISKLESS)
2054 /* even internal references gone, safe to destroy */ 2055 /* even internal references gone, safe to destroy */
2055 drbd_ldev_destroy(mdev); 2056 drbd_ldev_destroy(mdev);
2056 if (mdev->state.disk == D_FAILED) 2057 if (mdev->state.disk == D_FAILED) {
2057 /* all application IO references gone. */ 2058 /* all application IO references gone. */
2058 drbd_go_diskless(mdev); 2059 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
2060 drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
2061 }
2059 wake_up(&mdev->misc_wait); 2062 wake_up(&mdev->misc_wait);
2060 } 2063 }
2061} 2064}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 298b868910dc..a5dca6affcbb 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -45,7 +45,7 @@
45#include <linux/reboot.h> 45#include <linux/reboot.h>
46#include <linux/notifier.h> 46#include <linux/notifier.h>
47#include <linux/kthread.h> 47#include <linux/kthread.h>
48 48#include <linux/workqueue.h>
49#define __KERNEL_SYSCALLS__ 49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h> 50#include <linux/unistd.h>
51#include <linux/vmalloc.h> 51#include <linux/vmalloc.h>
@@ -2299,6 +2299,7 @@ static void drbd_cleanup(void)
2299 idr_for_each_entry(&minors, mdev, i) { 2299 idr_for_each_entry(&minors, mdev, i) {
2300 idr_remove(&minors, mdev_to_minor(mdev)); 2300 idr_remove(&minors, mdev_to_minor(mdev));
2301 idr_remove(&mdev->tconn->volumes, mdev->vnr); 2301 idr_remove(&mdev->tconn->volumes, mdev->vnr);
2302 destroy_workqueue(mdev->submit.wq);
2302 del_gendisk(mdev->vdisk); 2303 del_gendisk(mdev->vdisk);
2303 /* synchronize_rcu(); No other threads running at this point */ 2304 /* synchronize_rcu(); No other threads running at this point */
2304 kref_put(&mdev->kref, &drbd_minor_destroy); 2305 kref_put(&mdev->kref, &drbd_minor_destroy);
@@ -2588,6 +2589,21 @@ void conn_destroy(struct kref *kref)
2588 kfree(tconn); 2589 kfree(tconn);
2589} 2590}
2590 2591
2592int init_submitter(struct drbd_conf *mdev)
2593{
2594 /* opencoded create_singlethread_workqueue(),
2595 * to be able to say "drbd%d", ..., minor */
2596 mdev->submit.wq = alloc_workqueue("drbd%u_submit",
2597 WQ_UNBOUND | WQ_MEM_RECLAIM, 1, mdev->minor);
2598 if (!mdev->submit.wq)
2599 return -ENOMEM;
2600
2601 INIT_WORK(&mdev->submit.worker, do_submit);
2602 spin_lock_init(&mdev->submit.lock);
2603 INIT_LIST_HEAD(&mdev->submit.writes);
2604 return 0;
2605}
2606
2591enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) 2607enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
2592{ 2608{
2593 struct drbd_conf *mdev; 2609 struct drbd_conf *mdev;
@@ -2677,6 +2693,12 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
2677 goto out_idr_remove_minor; 2693 goto out_idr_remove_minor;
2678 } 2694 }
2679 2695
2696 if (init_submitter(mdev)) {
2697 err = ERR_NOMEM;
2698 drbd_msg_put_info("unable to create submit workqueue");
2699 goto out_idr_remove_vol;
2700 }
2701
2680 add_disk(disk); 2702 add_disk(disk);
2681 kref_init(&mdev->kref); /* one ref for both idrs and the add_disk */ 2703 kref_init(&mdev->kref); /* one ref for both idrs and the add_disk */
2682 2704
@@ -2687,6 +2709,8 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
2687 2709
2688 return NO_ERROR; 2710 return NO_ERROR;
2689 2711
2712out_idr_remove_vol:
2713 idr_remove(&tconn->volumes, vnr_got);
2690out_idr_remove_minor: 2714out_idr_remove_minor:
2691 idr_remove(&minors, minor_got); 2715 idr_remove(&minors, minor_got);
2692 synchronize_rcu(); 2716 synchronize_rcu();
@@ -2794,6 +2818,7 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
2794 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2818 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2795 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2819 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2796 2820
2821 kfree(ldev->disk_conf);
2797 kfree(ldev); 2822 kfree(ldev);
2798} 2823}
2799 2824
@@ -2833,8 +2858,9 @@ void conn_md_sync(struct drbd_tconn *tconn)
2833 rcu_read_unlock(); 2858 rcu_read_unlock();
2834} 2859}
2835 2860
2861/* aligned 4kByte */
2836struct meta_data_on_disk { 2862struct meta_data_on_disk {
2837 u64 la_size; /* last agreed size. */ 2863 u64 la_size_sect; /* last agreed size. */
2838 u64 uuid[UI_SIZE]; /* UUIDs. */ 2864 u64 uuid[UI_SIZE]; /* UUIDs. */
2839 u64 device_uuid; 2865 u64 device_uuid;
2840 u64 reserved_u64_1; 2866 u64 reserved_u64_1;
@@ -2842,13 +2868,17 @@ struct meta_data_on_disk {
2842 u32 magic; 2868 u32 magic;
2843 u32 md_size_sect; 2869 u32 md_size_sect;
2844 u32 al_offset; /* offset to this block */ 2870 u32 al_offset; /* offset to this block */
2845 u32 al_nr_extents; /* important for restoring the AL */ 2871 u32 al_nr_extents; /* important for restoring the AL (userspace) */
2846 /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ 2872 /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
2847 u32 bm_offset; /* offset to the bitmap, from here */ 2873 u32 bm_offset; /* offset to the bitmap, from here */
2848 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ 2874 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
2849 u32 la_peer_max_bio_size; /* last peer max_bio_size */ 2875 u32 la_peer_max_bio_size; /* last peer max_bio_size */
2850 u32 reserved_u32[3];
2851 2876
2877 /* see al_tr_number_to_on_disk_sector() */
2878 u32 al_stripes;
2879 u32 al_stripe_size_4k;
2880
2881 u8 reserved_u8[4096 - (7*8 + 10*4)];
2852} __packed; 2882} __packed;
2853 2883
2854/** 2884/**
@@ -2861,6 +2891,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
2861 sector_t sector; 2891 sector_t sector;
2862 int i; 2892 int i;
2863 2893
2894 /* Don't accidentally change the DRBD meta data layout. */
2895 BUILD_BUG_ON(UI_SIZE != 4);
2896 BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
2897
2864 del_timer(&mdev->md_sync_timer); 2898 del_timer(&mdev->md_sync_timer);
2865 /* timer may be rearmed by drbd_md_mark_dirty() now. */ 2899 /* timer may be rearmed by drbd_md_mark_dirty() now. */
2866 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) 2900 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
@@ -2875,9 +2909,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
2875 if (!buffer) 2909 if (!buffer)
2876 goto out; 2910 goto out;
2877 2911
2878 memset(buffer, 0, 512); 2912 memset(buffer, 0, sizeof(*buffer));
2879 2913
2880 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); 2914 buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
2881 for (i = UI_CURRENT; i < UI_SIZE; i++) 2915 for (i = UI_CURRENT; i < UI_SIZE; i++)
2882 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); 2916 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
2883 buffer->flags = cpu_to_be32(mdev->ldev->md.flags); 2917 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
@@ -2892,7 +2926,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
2892 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); 2926 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
2893 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); 2927 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
2894 2928
2895 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); 2929 buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes);
2930 buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k);
2931
2932 D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset);
2896 sector = mdev->ldev->md.md_offset; 2933 sector = mdev->ldev->md.md_offset;
2897 2934
2898 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { 2935 if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
@@ -2910,13 +2947,141 @@ out:
2910 put_ldev(mdev); 2947 put_ldev(mdev);
2911} 2948}
2912 2949
2950static int check_activity_log_stripe_size(struct drbd_conf *mdev,
2951 struct meta_data_on_disk *on_disk,
2952 struct drbd_md *in_core)
2953{
2954 u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
2955 u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
2956 u64 al_size_4k;
2957
2958 /* both not set: default to old fixed size activity log */
2959 if (al_stripes == 0 && al_stripe_size_4k == 0) {
2960 al_stripes = 1;
2961 al_stripe_size_4k = MD_32kB_SECT/8;
2962 }
2963
2964 /* some paranoia plausibility checks */
2965
2966 /* we need both values to be set */
2967 if (al_stripes == 0 || al_stripe_size_4k == 0)
2968 goto err;
2969
2970 al_size_4k = (u64)al_stripes * al_stripe_size_4k;
2971
2972 /* Upper limit of activity log area, to avoid potential overflow
2973 * problems in al_tr_number_to_on_disk_sector(). As of right now, more
2974 * than 72 * 4k blocks total only increases the amount of history, so
2975 * limiting this arbitrarily to 16 GB is not a real limitation ;-) */
2976 if (al_size_4k > (16 * 1024 * 1024/4))
2977 goto err;
2978
2979 /* Lower limit: we need at least 8 transaction slots (32kB)
2980 * to not break existing setups */
2981 if (al_size_4k < MD_32kB_SECT/8)
2982 goto err;
2983
2984 in_core->al_stripe_size_4k = al_stripe_size_4k;
2985 in_core->al_stripes = al_stripes;
2986 in_core->al_size_4k = al_size_4k;
2987
2988 return 0;
2989err:
2990 dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
2991 al_stripes, al_stripe_size_4k);
2992 return -EINVAL;
2993}
2994
2995static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2996{
2997 sector_t capacity = drbd_get_capacity(bdev->md_bdev);
2998 struct drbd_md *in_core = &bdev->md;
2999 s32 on_disk_al_sect;
3000 s32 on_disk_bm_sect;
3001
3002 /* The on-disk size of the activity log, calculated from offsets, and
3003 * the size of the activity log calculated from the stripe settings,
3004 * should match.
3005 * Though we could relax this a bit: it is ok, if the striped activity log
3006 * fits in the available on-disk activity log size.
3007 * Right now, that would break how resize is implemented.
3008 * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
3009 * of possible unused padding space in the on disk layout. */
3010 if (in_core->al_offset < 0) {
3011 if (in_core->bm_offset > in_core->al_offset)
3012 goto err;
3013 on_disk_al_sect = -in_core->al_offset;
3014 on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
3015 } else {
3016 if (in_core->al_offset != MD_4kB_SECT)
3017 goto err;
3018 if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
3019 goto err;
3020
3021 on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
3022 on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
3023 }
3024
3025 /* old fixed size meta data is exactly that: fixed. */
3026 if (in_core->meta_dev_idx >= 0) {
3027 if (in_core->md_size_sect != MD_128MB_SECT
3028 || in_core->al_offset != MD_4kB_SECT
3029 || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
3030 || in_core->al_stripes != 1
3031 || in_core->al_stripe_size_4k != MD_32kB_SECT/8)
3032 goto err;
3033 }
3034
3035 if (capacity < in_core->md_size_sect)
3036 goto err;
3037 if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
3038 goto err;
3039
3040 /* should be aligned, and at least 32k */
3041 if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
3042 goto err;
3043
3044 /* should fit (for now: exactly) into the available on-disk space;
3045 * overflow prevention is in check_activity_log_stripe_size() above. */
3046 if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
3047 goto err;
3048
3049 /* again, should be aligned */
3050 if (in_core->bm_offset & 7)
3051 goto err;
3052
3053 /* FIXME check for device grow with flex external meta data? */
3054
3055 /* can the available bitmap space cover the last agreed device size? */
3056 if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
3057 goto err;
3058
3059 return 0;
3060
3061err:
3062 dev_err(DEV, "meta data offsets don't make sense: idx=%d "
3063 "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
3064 "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
3065 in_core->meta_dev_idx,
3066 in_core->al_stripes, in_core->al_stripe_size_4k,
3067 in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
3068 (unsigned long long)in_core->la_size_sect,
3069 (unsigned long long)capacity);
3070
3071 return -EINVAL;
3072}
3073
3074
2913/** 3075/**
2914 * drbd_md_read() - Reads in the meta data super block 3076 * drbd_md_read() - Reads in the meta data super block
2915 * @mdev: DRBD device. 3077 * @mdev: DRBD device.
2916 * @bdev: Device from which the meta data should be read in. 3078 * @bdev: Device from which the meta data should be read in.
2917 * 3079 *
2918 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case 3080 * Return NO_ERROR on success, and an enum drbd_ret_code in case
2919 * something goes wrong. 3081 * something goes wrong.
3082 *
3083 * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
3084 * even before @bdev is assigned to @mdev->ldev.
2920 */ 3085 */
2921int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) 3086int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2922{ 3087{
@@ -2924,12 +3089,17 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2924 u32 magic, flags; 3089 u32 magic, flags;
2925 int i, rv = NO_ERROR; 3090 int i, rv = NO_ERROR;
2926 3091
2927 if (!get_ldev_if_state(mdev, D_ATTACHING)) 3092 if (mdev->state.disk != D_DISKLESS)
2928 return ERR_IO_MD_DISK; 3093 return ERR_DISK_CONFIGURED;
2929 3094
2930 buffer = drbd_md_get_buffer(mdev); 3095 buffer = drbd_md_get_buffer(mdev);
2931 if (!buffer) 3096 if (!buffer)
2932 goto out; 3097 return ERR_NOMEM;
3098
3099 /* First, figure out where our meta data superblock is located,
3100 * and read it. */
3101 bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
3102 bdev->md.md_offset = drbd_md_ss(bdev);
2933 3103
2934 if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { 3104 if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
2935 /* NOTE: can't do normal error processing here as this is 3105 /* NOTE: can't do normal error processing here as this is
@@ -2948,45 +3118,51 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2948 rv = ERR_MD_UNCLEAN; 3118 rv = ERR_MD_UNCLEAN;
2949 goto err; 3119 goto err;
2950 } 3120 }
3121
3122 rv = ERR_MD_INVALID;
2951 if (magic != DRBD_MD_MAGIC_08) { 3123 if (magic != DRBD_MD_MAGIC_08) {
2952 if (magic == DRBD_MD_MAGIC_07) 3124 if (magic == DRBD_MD_MAGIC_07)
2953 dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); 3125 dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
2954 else 3126 else
2955 dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); 3127 dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
2956 rv = ERR_MD_INVALID;
2957 goto err; 3128 goto err;
2958 } 3129 }
2959 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { 3130
2960 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", 3131 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
2961 be32_to_cpu(buffer->al_offset), bdev->md.al_offset); 3132 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
2962 rv = ERR_MD_INVALID; 3133 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
2963 goto err; 3134 goto err;
2964 } 3135 }
3136
3137
3138 /* convert to in_core endian */
3139 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
3140 for (i = UI_CURRENT; i < UI_SIZE; i++)
3141 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3142 bdev->md.flags = be32_to_cpu(buffer->flags);
3143 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3144
3145 bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
3146 bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
3147 bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
3148
3149 if (check_activity_log_stripe_size(mdev, buffer, &bdev->md))
3150 goto err;
3151 if (check_offsets_and_sizes(mdev, bdev))
3152 goto err;
3153
2965 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { 3154 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
2966 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", 3155 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
2967 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); 3156 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
2968 rv = ERR_MD_INVALID;
2969 goto err; 3157 goto err;
2970 } 3158 }
2971 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { 3159 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
2972 dev_err(DEV, "unexpected md_size: %u (expected %u)\n", 3160 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
2973 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); 3161 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
2974 rv = ERR_MD_INVALID;
2975 goto err; 3162 goto err;
2976 } 3163 }
2977 3164
2978 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { 3165 rv = NO_ERROR;
2979 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
2980 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
2981 rv = ERR_MD_INVALID;
2982 goto err;
2983 }
2984
2985 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
2986 for (i = UI_CURRENT; i < UI_SIZE; i++)
2987 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
2988 bdev->md.flags = be32_to_cpu(buffer->flags);
2989 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
2990 3166
2991 spin_lock_irq(&mdev->tconn->req_lock); 3167 spin_lock_irq(&mdev->tconn->req_lock);
2992 if (mdev->state.conn < C_CONNECTED) { 3168 if (mdev->state.conn < C_CONNECTED) {
@@ -2999,8 +3175,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
2999 3175
3000 err: 3176 err:
3001 drbd_md_put_buffer(mdev); 3177 drbd_md_put_buffer(mdev);
3002 out:
3003 put_ldev(mdev);
3004 3178
3005 return rv; 3179 return rv;
3006} 3180}
@@ -3238,8 +3412,12 @@ static int w_go_diskless(struct drbd_work *w, int unused)
3238 * end up here after a failed attach, before ldev was even assigned. 3412 * end up here after a failed attach, before ldev was even assigned.
3239 */ 3413 */
3240 if (mdev->bitmap && mdev->ldev) { 3414 if (mdev->bitmap && mdev->ldev) {
3415 /* An interrupted resync or similar is allowed to recount bits
3416 * while we detach.
3417 * Any modifications would not be expected anymore, though.
3418 */
3241 if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, 3419 if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
3242 "detach", BM_LOCKED_MASK)) { 3420 "detach", BM_LOCKED_TEST_ALLOWED)) {
3243 if (test_bit(WAS_READ_ERROR, &mdev->flags)) { 3421 if (test_bit(WAS_READ_ERROR, &mdev->flags)) {
3244 drbd_md_set_flag(mdev, MDF_FULL_SYNC); 3422 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3245 drbd_md_sync(mdev); 3423 drbd_md_sync(mdev);
@@ -3251,13 +3429,6 @@ static int w_go_diskless(struct drbd_work *w, int unused)
3251 return 0; 3429 return 0;
3252} 3430}
3253 3431
3254void drbd_go_diskless(struct drbd_conf *mdev)
3255{
3256 D_ASSERT(mdev->state.disk == D_FAILED);
3257 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3258 drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
3259}
3260
3261/** 3432/**
3262 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap 3433 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3263 * @mdev: DRBD device. 3434 * @mdev: DRBD device.
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 2af26fc95280..9e3f441e7e84 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -696,37 +696,52 @@ out:
696 return 0; 696 return 0;
697} 697}
698 698
699/* initializes the md.*_offset members, so we are able to find 699/* Initializes the md.*_offset members, so we are able to find
700 * the on disk meta data */ 700 * the on disk meta data.
701 *
702 * We currently have two possible layouts:
703 * external:
704 * |----------- md_size_sect ------------------|
705 * [ 4k superblock ][ activity log ][ Bitmap ]
706 * | al_offset == 8 |
707 * | bm_offset = al_offset + X |
708 * ==> bitmap sectors = md_size_sect - bm_offset
709 *
710 * internal:
711 * |----------- md_size_sect ------------------|
712 * [data.....][ Bitmap ][ activity log ][ 4k superblock ]
713 * | al_offset < 0 |
714 * | bm_offset = al_offset - Y |
715 * ==> bitmap sectors = Y = al_offset - bm_offset
716 *
717 * Activity log size used to be fixed 32kB,
718 * but is about to become configurable.
719 */
701static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, 720static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
702 struct drbd_backing_dev *bdev) 721 struct drbd_backing_dev *bdev)
703{ 722{
704 sector_t md_size_sect = 0; 723 sector_t md_size_sect = 0;
705 int meta_dev_idx; 724 unsigned int al_size_sect = bdev->md.al_size_4k * 8;
706 725
707 rcu_read_lock(); 726 bdev->md.md_offset = drbd_md_ss(bdev);
708 meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
709 727
710 switch (meta_dev_idx) { 728 switch (bdev->md.meta_dev_idx) {
711 default: 729 default:
712 /* v07 style fixed size indexed meta data */ 730 /* v07 style fixed size indexed meta data */
713 bdev->md.md_size_sect = MD_RESERVED_SECT; 731 bdev->md.md_size_sect = MD_128MB_SECT;
714 bdev->md.md_offset = drbd_md_ss__(mdev, bdev); 732 bdev->md.al_offset = MD_4kB_SECT;
715 bdev->md.al_offset = MD_AL_OFFSET; 733 bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
716 bdev->md.bm_offset = MD_BM_OFFSET;
717 break; 734 break;
718 case DRBD_MD_INDEX_FLEX_EXT: 735 case DRBD_MD_INDEX_FLEX_EXT:
719 /* just occupy the full device; unit: sectors */ 736 /* just occupy the full device; unit: sectors */
720 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); 737 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
721 bdev->md.md_offset = 0; 738 bdev->md.al_offset = MD_4kB_SECT;
722 bdev->md.al_offset = MD_AL_OFFSET; 739 bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
723 bdev->md.bm_offset = MD_BM_OFFSET;
724 break; 740 break;
725 case DRBD_MD_INDEX_INTERNAL: 741 case DRBD_MD_INDEX_INTERNAL:
726 case DRBD_MD_INDEX_FLEX_INT: 742 case DRBD_MD_INDEX_FLEX_INT:
727 bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
728 /* al size is still fixed */ 743 /* al size is still fixed */
729 bdev->md.al_offset = -MD_AL_SECTORS; 744 bdev->md.al_offset = -al_size_sect;
730 /* we need (slightly less than) ~ this much bitmap sectors: */ 745 /* we need (slightly less than) ~ this much bitmap sectors: */
731 md_size_sect = drbd_get_capacity(bdev->backing_bdev); 746 md_size_sect = drbd_get_capacity(bdev->backing_bdev);
732 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); 747 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
@@ -735,14 +750,13 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
735 750
736 /* plus the "drbd meta data super block", 751 /* plus the "drbd meta data super block",
737 * and the activity log; */ 752 * and the activity log; */
738 md_size_sect += MD_BM_OFFSET; 753 md_size_sect += MD_4kB_SECT + al_size_sect;
739 754
740 bdev->md.md_size_sect = md_size_sect; 755 bdev->md.md_size_sect = md_size_sect;
741 /* bitmap offset is adjusted by 'super' block size */ 756 /* bitmap offset is adjusted by 'super' block size */
742 bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; 757 bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT;
743 break; 758 break;
744 } 759 }
745 rcu_read_unlock();
746} 760}
747 761
748/* input size is expected to be in KB */ 762/* input size is expected to be in KB */
@@ -805,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
805enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) 819enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
806{ 820{
807 sector_t prev_first_sect, prev_size; /* previous meta location */ 821 sector_t prev_first_sect, prev_size; /* previous meta location */
808 sector_t la_size, u_size; 822 sector_t la_size_sect, u_size;
809 sector_t size; 823 sector_t size;
810 char ppb[10]; 824 char ppb[10];
811 825
@@ -828,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
828 842
829 prev_first_sect = drbd_md_first_sector(mdev->ldev); 843 prev_first_sect = drbd_md_first_sector(mdev->ldev);
830 prev_size = mdev->ldev->md.md_size_sect; 844 prev_size = mdev->ldev->md.md_size_sect;
831 la_size = mdev->ldev->md.la_size_sect; 845 la_size_sect = mdev->ldev->md.la_size_sect;
832 846
833 /* TODO: should only be some assert here, not (re)init... */ 847 /* TODO: should only be some assert here, not (re)init... */
834 drbd_md_set_sector_offsets(mdev, mdev->ldev); 848 drbd_md_set_sector_offsets(mdev, mdev->ldev);
@@ -864,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
864 if (rv == dev_size_error) 878 if (rv == dev_size_error)
865 goto out; 879 goto out;
866 880
867 la_size_changed = (la_size != mdev->ldev->md.la_size_sect); 881 la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);
868 882
869 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) 883 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
870 || prev_size != mdev->ldev->md.md_size_sect; 884 || prev_size != mdev->ldev->md.md_size_sect;
@@ -886,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
886 drbd_md_mark_dirty(mdev); 900 drbd_md_mark_dirty(mdev);
887 } 901 }
888 902
889 if (size > la_size) 903 if (size > la_size_sect)
890 rv = grew; 904 rv = grew;
891 if (size < la_size) 905 if (size < la_size_sect)
892 rv = shrunk; 906 rv = shrunk;
893out: 907out:
894 lc_unlock(mdev->act_log); 908 lc_unlock(mdev->act_log);
@@ -903,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
903 sector_t u_size, int assume_peer_has_space) 917 sector_t u_size, int assume_peer_has_space)
904{ 918{
905 sector_t p_size = mdev->p_size; /* partner's disk size. */ 919 sector_t p_size = mdev->p_size; /* partner's disk size. */
906 sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ 920 sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
907 sector_t m_size; /* my size */ 921 sector_t m_size; /* my size */
908 sector_t size = 0; 922 sector_t size = 0;
909 923
@@ -917,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
917 if (p_size && m_size) { 931 if (p_size && m_size) {
918 size = min_t(sector_t, p_size, m_size); 932 size = min_t(sector_t, p_size, m_size);
919 } else { 933 } else {
920 if (la_size) { 934 if (la_size_sect) {
921 size = la_size; 935 size = la_size_sect;
922 if (m_size && m_size < size) 936 if (m_size && m_size < size)
923 size = m_size; 937 size = m_size;
924 if (p_size && p_size < size) 938 if (p_size && p_size < size)
@@ -1127,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info)
1127 return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); 1141 return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
1128} 1142}
1129 1143
1130static void enforce_disk_conf_limits(struct disk_conf *dc) 1144static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
1131{ 1145{
1132 if (dc->al_extents < DRBD_AL_EXTENTS_MIN) 1146 /* This is limited by 16 bit "slot" numbers,
1133 dc->al_extents = DRBD_AL_EXTENTS_MIN; 1147 * and by available on-disk context storage.
1134 if (dc->al_extents > DRBD_AL_EXTENTS_MAX) 1148 *
1135 dc->al_extents = DRBD_AL_EXTENTS_MAX; 1149 * Also (u16)~0 is special (denotes a "free" extent).
1150 *
1151 * One transaction occupies one 4kB on-disk block,
1152 * we have n such blocks in the on disk ring buffer,
1153 * the "current" transaction may fail (n-1),
1154 * and there are 919 slot numbers of context information per transaction.
1155 *
1156 * 72 transaction blocks amounts to more than 2**16 context slots,
1157 * so cap there first.
1158 */
1159 const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
1160 const unsigned int sufficient_on_disk =
1161 (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
1162 /AL_CONTEXT_PER_TRANSACTION;
1163
1164 unsigned int al_size_4k = bdev->md.al_size_4k;
1165
1166 if (al_size_4k > sufficient_on_disk)
1167 return max_al_nr;
1136 1168
1137 if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) 1169 return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
1138 dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1139} 1170}
1140 1171
1141int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) 1172int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
@@ -1182,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1182 if (!expect(new_disk_conf->resync_rate >= 1)) 1213 if (!expect(new_disk_conf->resync_rate >= 1))
1183 new_disk_conf->resync_rate = 1; 1214 new_disk_conf->resync_rate = 1;
1184 1215
1185 enforce_disk_conf_limits(new_disk_conf); 1216 if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1217 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1218 if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev))
1219 new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev);
1220
1221 if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1222 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1186 1223
1187 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; 1224 fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
1188 if (fifo_size != mdev->rs_plan_s->size) { 1225 if (fifo_size != mdev->rs_plan_s->size) {
@@ -1330,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1330 goto fail; 1367 goto fail;
1331 } 1368 }
1332 1369
1333 enforce_disk_conf_limits(new_disk_conf); 1370 if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1371 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1334 1372
1335 new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); 1373 new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
1336 if (!new_plan) { 1374 if (!new_plan) {
@@ -1343,6 +1381,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1343 goto fail; 1381 goto fail;
1344 } 1382 }
1345 1383
1384 write_lock_irq(&global_state_lock);
1385 retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
1386 write_unlock_irq(&global_state_lock);
1387 if (retcode != NO_ERROR)
1388 goto fail;
1389
1346 rcu_read_lock(); 1390 rcu_read_lock();
1347 nc = rcu_dereference(mdev->tconn->net_conf); 1391 nc = rcu_dereference(mdev->tconn->net_conf);
1348 if (nc) { 1392 if (nc) {
@@ -1399,8 +1443,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1399 goto fail; 1443 goto fail;
1400 } 1444 }
1401 1445
1402 /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ 1446 /* Read our meta data super block early.
1403 drbd_md_set_sector_offsets(mdev, nbc); 1447 * This also sets other on-disk offsets. */
1448 retcode = drbd_md_read(mdev, nbc);
1449 if (retcode != NO_ERROR)
1450 goto fail;
1451
1452 if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1453 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1454 if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
1455 new_disk_conf->al_extents = drbd_al_extents_max(nbc);
1404 1456
1405 if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { 1457 if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
1406 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", 1458 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
@@ -1416,7 +1468,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1416 min_md_device_sectors = (2<<10); 1468 min_md_device_sectors = (2<<10);
1417 } else { 1469 } else {
1418 max_possible_sectors = DRBD_MAX_SECTORS; 1470 max_possible_sectors = DRBD_MAX_SECTORS;
1419 min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); 1471 min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
1420 } 1472 }
1421 1473
1422 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { 1474 if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
@@ -1467,8 +1519,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1467 if (!get_ldev_if_state(mdev, D_ATTACHING)) 1519 if (!get_ldev_if_state(mdev, D_ATTACHING))
1468 goto force_diskless; 1520 goto force_diskless;
1469 1521
1470 drbd_md_set_sector_offsets(mdev, nbc);
1471
1472 if (!mdev->bitmap) { 1522 if (!mdev->bitmap) {
1473 if (drbd_bm_init(mdev)) { 1523 if (drbd_bm_init(mdev)) {
1474 retcode = ERR_NOMEM; 1524 retcode = ERR_NOMEM;
@@ -1476,10 +1526,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1476 } 1526 }
1477 } 1527 }
1478 1528
1479 retcode = drbd_md_read(mdev, nbc);
1480 if (retcode != NO_ERROR)
1481 goto force_diskless_dec;
1482
1483 if (mdev->state.conn < C_CONNECTED && 1529 if (mdev->state.conn < C_CONNECTED &&
1484 mdev->state.role == R_PRIMARY && 1530 mdev->state.role == R_PRIMARY &&
1485 (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { 1531 (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
@@ -2158,8 +2204,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for
2158 return SS_SUCCESS; 2204 return SS_SUCCESS;
2159 case SS_PRIMARY_NOP: 2205 case SS_PRIMARY_NOP:
2160 /* Our state checking code wants to see the peer outdated. */ 2206 /* Our state checking code wants to see the peer outdated. */
2161 rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, 2207 rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
2162 pdsk, D_OUTDATED), CS_VERBOSE); 2208
2209 if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
2210 rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE);
2211
2163 break; 2212 break;
2164 case SS_CW_FAILED_BY_PEER: 2213 case SS_CW_FAILED_BY_PEER:
2165 /* The peer probably wants to see us outdated. */ 2214 /* The peer probably wants to see us outdated. */
@@ -2406,22 +2455,19 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2406 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); 2455 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2407 drbd_flush_workqueue(mdev); 2456 drbd_flush_workqueue(mdev);
2408 2457
2409 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); 2458 /* If we happen to be C_STANDALONE R_SECONDARY, just change to
2410 2459 * D_INCONSISTENT, and set all bits in the bitmap. Otherwise,
2411 if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) 2460 * try to start a resync handshake as sync target for full sync.
2412 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); 2461 */
2413 2462 if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) {
2414 while (retcode == SS_NEED_CONNECTION) { 2463 retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT));
2415 spin_lock_irq(&mdev->tconn->req_lock); 2464 if (retcode >= SS_SUCCESS) {
2416 if (mdev->state.conn < C_CONNECTED) 2465 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
2417 retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); 2466 "set_n_write from invalidate", BM_LOCKED_MASK))
2418 spin_unlock_irq(&mdev->tconn->req_lock); 2467 retcode = ERR_IO_MD_DISK;
2419 2468 }
2420 if (retcode != SS_NEED_CONNECTION) 2469 } else
2421 break;
2422
2423 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); 2470 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
2424 }
2425 drbd_resume_io(mdev); 2471 drbd_resume_io(mdev);
2426 2472
2427out: 2473out:
@@ -2475,21 +2521,22 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2475 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); 2521 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2476 drbd_flush_workqueue(mdev); 2522 drbd_flush_workqueue(mdev);
2477 2523
2478 retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); 2524 /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
2479 if (retcode < SS_SUCCESS) { 2525 * in the bitmap. Otherwise, try to start a resync handshake
2480 if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { 2526 * as sync source for full sync.
2481 /* The peer will get a resync upon connect anyways. 2527 */
2482 * Just make that into a full resync. */ 2528 if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) {
2483 retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); 2529 /* The peer will get a resync upon connect anyways. Just make that
2484 if (retcode >= SS_SUCCESS) { 2530 into a full resync. */
2485 if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, 2531 retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
2486 "set_n_write from invalidate_peer", 2532 if (retcode >= SS_SUCCESS) {
2487 BM_LOCKED_SET_ALLOWED)) 2533 if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
2488 retcode = ERR_IO_MD_DISK; 2534 "set_n_write from invalidate_peer",
2489 } 2535 BM_LOCKED_SET_ALLOWED))
2490 } else 2536 retcode = ERR_IO_MD_DISK;
2491 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); 2537 }
2492 } 2538 } else
2539 retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
2493 drbd_resume_io(mdev); 2540 drbd_resume_io(mdev);
2494 2541
2495out: 2542out:
@@ -3162,6 +3209,7 @@ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev)
3162 CS_VERBOSE + CS_WAIT_COMPLETE); 3209 CS_VERBOSE + CS_WAIT_COMPLETE);
3163 idr_remove(&mdev->tconn->volumes, mdev->vnr); 3210 idr_remove(&mdev->tconn->volumes, mdev->vnr);
3164 idr_remove(&minors, mdev_to_minor(mdev)); 3211 idr_remove(&minors, mdev_to_minor(mdev));
3212 destroy_workqueue(mdev->submit.wq);
3165 del_gendisk(mdev->vdisk); 3213 del_gendisk(mdev->vdisk);
3166 synchronize_rcu(); 3214 synchronize_rcu();
3167 kref_put(&mdev->kref, &drbd_minor_destroy); 3215 kref_put(&mdev->kref, &drbd_minor_destroy);
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 928adb815b09..bf31d41dbaad 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -313,8 +313,14 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
313 313
314static int drbd_proc_open(struct inode *inode, struct file *file) 314static int drbd_proc_open(struct inode *inode, struct file *file)
315{ 315{
316 if (try_module_get(THIS_MODULE)) 316 int err;
317 return single_open(file, drbd_seq_show, PDE_DATA(inode)); 317
318 if (try_module_get(THIS_MODULE)) {
319 err = single_open(file, drbd_seq_show, PDE_DATA(inode));
320 if (err)
321 module_put(THIS_MODULE);
322 return err;
323 }
318 return -ENODEV; 324 return -ENODEV;
319} 325}
320 326
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 83c5ae0ed56b..4222affff488 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -850,6 +850,7 @@ int drbd_connected(struct drbd_conf *mdev)
850 err = drbd_send_current_state(mdev); 850 err = drbd_send_current_state(mdev);
851 clear_bit(USE_DEGR_WFC_T, &mdev->flags); 851 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
852 clear_bit(RESIZE_PENDING, &mdev->flags); 852 clear_bit(RESIZE_PENDING, &mdev->flags);
853 atomic_set(&mdev->ap_in_flight, 0);
853 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ 854 mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
854 return err; 855 return err;
855} 856}
@@ -2266,7 +2267,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
2266 drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); 2267 drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
2267 peer_req->flags |= EE_CALL_AL_COMPLETE_IO; 2268 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2268 peer_req->flags &= ~EE_MAY_SET_IN_SYNC; 2269 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
2269 drbd_al_begin_io(mdev, &peer_req->i); 2270 drbd_al_begin_io(mdev, &peer_req->i, true);
2270 } 2271 }
2271 2272
2272 err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); 2273 err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2662,7 +2663,6 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2662 if (hg == -1 && mdev->state.role == R_PRIMARY) { 2663 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2663 enum drbd_state_rv rv2; 2664 enum drbd_state_rv rv2;
2664 2665
2665 drbd_set_role(mdev, R_SECONDARY, 0);
2666 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, 2666 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2667 * we might be here in C_WF_REPORT_PARAMS which is transient. 2667 * we might be here in C_WF_REPORT_PARAMS which is transient.
2668 * we do not need to wait for the after state change work either. */ 2668 * we do not need to wait for the after state change work either. */
@@ -3993,7 +3993,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
3993 3993
3994 clear_bit(DISCARD_MY_DATA, &mdev->flags); 3994 clear_bit(DISCARD_MY_DATA, &mdev->flags);
3995 3995
3996 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ 3996 drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */
3997 3997
3998 return 0; 3998 return 0;
3999} 3999}
@@ -4660,8 +4660,8 @@ static int drbd_do_features(struct drbd_tconn *tconn)
4660#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) 4660#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
4661static int drbd_do_auth(struct drbd_tconn *tconn) 4661static int drbd_do_auth(struct drbd_tconn *tconn)
4662{ 4662{
4663 dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); 4663 conn_err(tconn, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
4664 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); 4664 conn_err(tconn, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
4665 return -1; 4665 return -1;
4666} 4666}
4667#else 4667#else
@@ -5258,9 +5258,11 @@ int drbd_asender(struct drbd_thread *thi)
5258 bool ping_timeout_active = false; 5258 bool ping_timeout_active = false;
5259 struct net_conf *nc; 5259 struct net_conf *nc;
5260 int ping_timeo, tcp_cork, ping_int; 5260 int ping_timeo, tcp_cork, ping_int;
5261 struct sched_param param = { .sched_priority = 2 };
5261 5262
5262 current->policy = SCHED_RR; /* Make this a realtime task! */ 5263 rv = sched_setscheduler(current, SCHED_RR, &param);
5263 current->rt_priority = 2; /* more important than all other tasks */ 5264 if (rv < 0)
5265 conn_err(tconn, "drbd_asender: ERROR set priority, ret=%d\n", rv);
5264 5266
5265 while (get_t_state(thi) == RUNNING) { 5267 while (get_t_state(thi) == RUNNING) {
5266 drbd_thread_current_set_cpu(thi); 5268 drbd_thread_current_set_cpu(thi);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 2b8303ad63c9..c24379ffd4e3 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -34,14 +34,14 @@
34static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); 34static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);
35 35
36/* Update disk stats at start of I/O request */ 36/* Update disk stats at start of I/O request */
37static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) 37static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
38{ 38{
39 const int rw = bio_data_dir(bio); 39 const int rw = bio_data_dir(req->master_bio);
40 int cpu; 40 int cpu;
41 cpu = part_stat_lock(); 41 cpu = part_stat_lock();
42 part_round_stats(cpu, &mdev->vdisk->part0); 42 part_round_stats(cpu, &mdev->vdisk->part0);
43 part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); 43 part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
44 part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); 44 part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9);
45 (void) cpu; /* The macro invocations above want the cpu argument, I do not like 45 (void) cpu; /* The macro invocations above want the cpu argument, I do not like
46 the compiler warning about cpu only assigned but never used... */ 46 the compiler warning about cpu only assigned but never used... */
47 part_inc_in_flight(&mdev->vdisk->part0, rw); 47 part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -263,8 +263,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
263 else 263 else
264 root = &mdev->read_requests; 264 root = &mdev->read_requests;
265 drbd_remove_request_interval(root, req); 265 drbd_remove_request_interval(root, req);
266 } else if (!(s & RQ_POSTPONED)) 266 }
267 D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
268 267
269 /* Before we can signal completion to the upper layers, 268 /* Before we can signal completion to the upper layers,
270 * we may need to close the current transfer log epoch. 269 * we may need to close the current transfer log epoch.
@@ -755,6 +754,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
755 D_ASSERT(req->rq_state & RQ_NET_PENDING); 754 D_ASSERT(req->rq_state & RQ_NET_PENDING);
756 mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); 755 mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
757 break; 756 break;
757
758 case QUEUE_AS_DRBD_BARRIER:
759 start_new_tl_epoch(mdev->tconn);
760 mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
761 break;
758 }; 762 };
759 763
760 return rv; 764 return rv;
@@ -861,8 +865,10 @@ static void maybe_pull_ahead(struct drbd_conf *mdev)
861 bool congested = false; 865 bool congested = false;
862 enum drbd_on_congestion on_congestion; 866 enum drbd_on_congestion on_congestion;
863 867
868 rcu_read_lock();
864 nc = rcu_dereference(tconn->net_conf); 869 nc = rcu_dereference(tconn->net_conf);
865 on_congestion = nc ? nc->on_congestion : OC_BLOCK; 870 on_congestion = nc ? nc->on_congestion : OC_BLOCK;
871 rcu_read_unlock();
866 if (on_congestion == OC_BLOCK || 872 if (on_congestion == OC_BLOCK ||
867 tconn->agreed_pro_version < 96) 873 tconn->agreed_pro_version < 96)
868 return; 874 return;
@@ -956,14 +962,8 @@ static int drbd_process_write_request(struct drbd_request *req)
956 struct drbd_conf *mdev = req->w.mdev; 962 struct drbd_conf *mdev = req->w.mdev;
957 int remote, send_oos; 963 int remote, send_oos;
958 964
959 rcu_read_lock();
960 remote = drbd_should_do_remote(mdev->state); 965 remote = drbd_should_do_remote(mdev->state);
961 if (remote) {
962 maybe_pull_ahead(mdev);
963 remote = drbd_should_do_remote(mdev->state);
964 }
965 send_oos = drbd_should_send_out_of_sync(mdev->state); 966 send_oos = drbd_should_send_out_of_sync(mdev->state);
966 rcu_read_unlock();
967 967
968 /* Need to replicate writes. Unless it is an empty flush, 968 /* Need to replicate writes. Unless it is an empty flush,
969 * which is better mapped to a DRBD P_BARRIER packet, 969 * which is better mapped to a DRBD P_BARRIER packet,
@@ -975,8 +975,8 @@ static int drbd_process_write_request(struct drbd_request *req)
975 /* The only size==0 bios we expect are empty flushes. */ 975 /* The only size==0 bios we expect are empty flushes. */
976 D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); 976 D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH);
977 if (remote) 977 if (remote)
978 start_new_tl_epoch(mdev->tconn); 978 _req_mod(req, QUEUE_AS_DRBD_BARRIER);
979 return 0; 979 return remote;
980 } 980 }
981 981
982 if (!remote && !send_oos) 982 if (!remote && !send_oos)
@@ -1020,12 +1020,24 @@ drbd_submit_req_private_bio(struct drbd_request *req)
1020 bio_endio(bio, -EIO); 1020 bio_endio(bio, -EIO);
1021} 1021}
1022 1022
1023void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) 1023static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req)
1024{ 1024{
1025 const int rw = bio_rw(bio); 1025 spin_lock(&mdev->submit.lock);
1026 struct bio_and_error m = { NULL, }; 1026 list_add_tail(&req->tl_requests, &mdev->submit.writes);
1027 spin_unlock(&mdev->submit.lock);
1028 queue_work(mdev->submit.wq, &mdev->submit.worker);
1029}
1030
1031/* returns the new drbd_request pointer, if the caller is expected to
1032 * drbd_send_and_submit() it (to save latency), or NULL if we queued the
1033 * request on the submitter thread.
1034 * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
1035 */
1036struct drbd_request *
1037drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
1038{
1039 const int rw = bio_data_dir(bio);
1027 struct drbd_request *req; 1040 struct drbd_request *req;
1028 bool no_remote = false;
1029 1041
1030 /* allocate outside of all locks; */ 1042 /* allocate outside of all locks; */
1031 req = drbd_req_new(mdev, bio); 1043 req = drbd_req_new(mdev, bio);
@@ -1035,7 +1047,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
1035 * if user cannot handle io errors, that's not our business. */ 1047 * if user cannot handle io errors, that's not our business. */
1036 dev_err(DEV, "could not kmalloc() req\n"); 1048 dev_err(DEV, "could not kmalloc() req\n");
1037 bio_endio(bio, -ENOMEM); 1049 bio_endio(bio, -ENOMEM);
1038 return; 1050 return ERR_PTR(-ENOMEM);
1039 } 1051 }
1040 req->start_time = start_time; 1052 req->start_time = start_time;
1041 1053
@@ -1044,28 +1056,40 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
1044 req->private_bio = NULL; 1056 req->private_bio = NULL;
1045 } 1057 }
1046 1058
1047 /* For WRITES going to the local disk, grab a reference on the target 1059 /* Update disk stats */
1048 * extent. This waits for any resync activity in the corresponding 1060 _drbd_start_io_acct(mdev, req);
1049 * resync extent to finish, and, if necessary, pulls in the target 1061
1050 * extent into the activity log, which involves further disk io because
1051 * of transactional on-disk meta data updates.
1052 * Empty flushes don't need to go into the activity log, they can only
1053 * flush data for pending writes which are already in there. */
1054 if (rw == WRITE && req->private_bio && req->i.size 1062 if (rw == WRITE && req->private_bio && req->i.size
1055 && !test_bit(AL_SUSPENDED, &mdev->flags)) { 1063 && !test_bit(AL_SUSPENDED, &mdev->flags)) {
1064 if (!drbd_al_begin_io_fastpath(mdev, &req->i)) {
1065 drbd_queue_write(mdev, req);
1066 return NULL;
1067 }
1056 req->rq_state |= RQ_IN_ACT_LOG; 1068 req->rq_state |= RQ_IN_ACT_LOG;
1057 drbd_al_begin_io(mdev, &req->i);
1058 } 1069 }
1059 1070
1071 return req;
1072}
1073
1074static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req)
1075{
1076 const int rw = bio_rw(req->master_bio);
1077 struct bio_and_error m = { NULL, };
1078 bool no_remote = false;
1079
1060 spin_lock_irq(&mdev->tconn->req_lock); 1080 spin_lock_irq(&mdev->tconn->req_lock);
1061 if (rw == WRITE) { 1081 if (rw == WRITE) {
1062 /* This may temporarily give up the req_lock, 1082 /* This may temporarily give up the req_lock,
1063 * but will re-aquire it before it returns here. 1083 * but will re-aquire it before it returns here.
1064 * Needs to be before the check on drbd_suspended() */ 1084 * Needs to be before the check on drbd_suspended() */
1065 complete_conflicting_writes(req); 1085 complete_conflicting_writes(req);
1086 /* no more giving up req_lock from now on! */
1087
1088 /* check for congestion, and potentially stop sending
1089 * full data updates, but start sending "dirty bits" only. */
1090 maybe_pull_ahead(mdev);
1066 } 1091 }
1067 1092
1068 /* no more giving up req_lock from now on! */
1069 1093
1070 if (drbd_suspended(mdev)) { 1094 if (drbd_suspended(mdev)) {
1071 /* push back and retry: */ 1095 /* push back and retry: */
@@ -1078,9 +1102,6 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
1078 goto out; 1102 goto out;
1079 } 1103 }
1080 1104
1081 /* Update disk stats */
1082 _drbd_start_io_acct(mdev, req, bio);
1083
1084 /* We fail READ/READA early, if we can not serve it. 1105 /* We fail READ/READA early, if we can not serve it.
1085 * We must do this before req is registered on any lists. 1106 * We must do this before req is registered on any lists.
1086 * Otherwise, drbd_req_complete() will queue failed READ for retry. */ 1107 * Otherwise, drbd_req_complete() will queue failed READ for retry. */
@@ -1137,7 +1158,116 @@ out:
1137 1158
1138 if (m.bio) 1159 if (m.bio)
1139 complete_master_bio(mdev, &m); 1160 complete_master_bio(mdev, &m);
1140 return; 1161}
1162
1163void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
1164{
1165 struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time);
1166 if (IS_ERR_OR_NULL(req))
1167 return;
1168 drbd_send_and_submit(mdev, req);
1169}
1170
1171static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming)
1172{
1173 struct drbd_request *req, *tmp;
1174 list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
1175 const int rw = bio_data_dir(req->master_bio);
1176
1177 if (rw == WRITE /* rw != WRITE should not even end up here! */
1178 && req->private_bio && req->i.size
1179 && !test_bit(AL_SUSPENDED, &mdev->flags)) {
1180 if (!drbd_al_begin_io_fastpath(mdev, &req->i))
1181 continue;
1182
1183 req->rq_state |= RQ_IN_ACT_LOG;
1184 }
1185
1186 list_del_init(&req->tl_requests);
1187 drbd_send_and_submit(mdev, req);
1188 }
1189}
1190
1191static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev,
1192 struct list_head *incoming,
1193 struct list_head *pending)
1194{
1195 struct drbd_request *req, *tmp;
1196 int wake = 0;
1197 int err;
1198
1199 spin_lock_irq(&mdev->al_lock);
1200 list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
1201 err = drbd_al_begin_io_nonblock(mdev, &req->i);
1202 if (err == -EBUSY)
1203 wake = 1;
1204 if (err)
1205 continue;
1206 req->rq_state |= RQ_IN_ACT_LOG;
1207 list_move_tail(&req->tl_requests, pending);
1208 }
1209 spin_unlock_irq(&mdev->al_lock);
1210 if (wake)
1211 wake_up(&mdev->al_wait);
1212
1213 return !list_empty(pending);
1214}
1215
1216void do_submit(struct work_struct *ws)
1217{
1218 struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker);
1219 LIST_HEAD(incoming);
1220 LIST_HEAD(pending);
1221 struct drbd_request *req, *tmp;
1222
1223 for (;;) {
1224 spin_lock(&mdev->submit.lock);
1225 list_splice_tail_init(&mdev->submit.writes, &incoming);
1226 spin_unlock(&mdev->submit.lock);
1227
1228 submit_fast_path(mdev, &incoming);
1229 if (list_empty(&incoming))
1230 break;
1231
1232 wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending));
1233 /* Maybe more was queued, while we prepared the transaction?
1234 * Try to stuff them into this transaction as well.
1235 * Be strictly non-blocking here, no wait_event, we already
1236 * have something to commit.
1237 * Stop if we don't make any more progres.
1238 */
1239 for (;;) {
1240 LIST_HEAD(more_pending);
1241 LIST_HEAD(more_incoming);
1242 bool made_progress;
1243
1244 /* It is ok to look outside the lock,
1245 * it's only an optimization anyways */
1246 if (list_empty(&mdev->submit.writes))
1247 break;
1248
1249 spin_lock(&mdev->submit.lock);
1250 list_splice_tail_init(&mdev->submit.writes, &more_incoming);
1251 spin_unlock(&mdev->submit.lock);
1252
1253 if (list_empty(&more_incoming))
1254 break;
1255
1256 made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending);
1257
1258 list_splice_tail_init(&more_pending, &pending);
1259 list_splice_tail_init(&more_incoming, &incoming);
1260
1261 if (!made_progress)
1262 break;
1263 }
1264 drbd_al_begin_io_commit(mdev, false);
1265
1266 list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
1267 list_del_init(&req->tl_requests);
1268 drbd_send_and_submit(mdev, req);
1269 }
1270 }
1141} 1271}
1142 1272
1143void drbd_make_request(struct request_queue *q, struct bio *bio) 1273void drbd_make_request(struct request_queue *q, struct bio *bio)
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index c08d22964d06..978cb1addc98 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -88,6 +88,14 @@ enum drbd_req_event {
88 QUEUE_FOR_NET_READ, 88 QUEUE_FOR_NET_READ,
89 QUEUE_FOR_SEND_OOS, 89 QUEUE_FOR_SEND_OOS,
90 90
91 /* An empty flush is queued as P_BARRIER,
92 * which will cause it to complete "successfully",
93 * even if the local disk flush failed.
94 *
95 * Just like "real" requests, empty flushes (blkdev_issue_flush()) will
96 * only see an error if neither local nor remote data is reachable. */
97 QUEUE_AS_DRBD_BARRIER,
98
91 SEND_CANCELED, 99 SEND_CANCELED,
92 SEND_FAILED, 100 SEND_FAILED,
93 HANDED_OVER_TO_NETWORK, 101 HANDED_OVER_TO_NETWORK,
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 0fe220cfb9e9..90c5be2b1d30 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -570,6 +570,13 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
570 mdev->tconn->agreed_pro_version < 88) 570 mdev->tconn->agreed_pro_version < 88)
571 rv = SS_NOT_SUPPORTED; 571 rv = SS_NOT_SUPPORTED;
572 572
573 else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
574 rv = SS_NO_UP_TO_DATE_DISK;
575
576 else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
577 ns.pdsk == D_UNKNOWN)
578 rv = SS_NEED_CONNECTION;
579
573 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) 580 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
574 rv = SS_CONNECTED_OUTDATES; 581 rv = SS_CONNECTED_OUTDATES;
575 582
@@ -635,6 +642,10 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_t
635 && os.conn < C_WF_REPORT_PARAMS) 642 && os.conn < C_WF_REPORT_PARAMS)
636 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ 643 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
637 644
645 if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
646 os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
647 rv = SS_OUTDATE_WO_CONN;
648
638 return rv; 649 return rv;
639} 650}
640 651
@@ -1377,13 +1388,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1377 &drbd_bmio_set_n_write, &abw_start_sync, 1388 &drbd_bmio_set_n_write, &abw_start_sync,
1378 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); 1389 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1379 1390
1380 /* We are invalidating our self... */
1381 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1382 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1383 /* other bitmap operation expected during this phase */
1384 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1385 "set_n_write from invalidate", BM_LOCKED_MASK);
1386
1387 /* first half of local IO error, failure to attach, 1391 /* first half of local IO error, failure to attach,
1388 * or administrative detach */ 1392 * or administrative detach */
1389 if (os.disk != D_FAILED && ns.disk == D_FAILED) { 1393 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
@@ -1748,13 +1752,9 @@ _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state
1748 if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) 1752 if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags))
1749 return SS_CW_FAILED_BY_PEER; 1753 return SS_CW_FAILED_BY_PEER;
1750 1754
1751 rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; 1755 rv = conn_is_valid_transition(tconn, mask, val, 0);
1752 1756 if (rv == SS_SUCCESS && tconn->cstate == C_WF_REPORT_PARAMS)
1753 if (rv == SS_UNKNOWN_ERROR) 1757 rv = SS_UNKNOWN_ERROR; /* continue waiting */
1754 rv = conn_is_valid_transition(tconn, mask, val, 0);
1755
1756 if (rv == SS_SUCCESS)
1757 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
1758 1758
1759 return rv; 1759 return rv;
1760} 1760}
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 9a664bd27404..58e08ff2b2ce 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = {
89 [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", 89 [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
90 [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", 90 [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
91 [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", 91 [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
92 [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
92 [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", 93 [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
93}; 94};
94 95
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 424dc7bdf9b7..891c0ecaa292 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error)
89 md_io->done = 1; 89 md_io->done = 1;
90 wake_up(&mdev->misc_wait); 90 wake_up(&mdev->misc_wait);
91 bio_put(bio); 91 bio_put(bio);
92 put_ldev(mdev); 92 if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
93 put_ldev(mdev);
93} 94}
94 95
95/* reads on behalf of the partner, 96/* reads on behalf of the partner,
@@ -1410,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
1410 struct drbd_conf *mdev = w->mdev; 1411 struct drbd_conf *mdev = w->mdev;
1411 1412
1412 if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) 1413 if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1413 drbd_al_begin_io(mdev, &req->i); 1414 drbd_al_begin_io(mdev, &req->i, false);
1414 1415
1415 drbd_req_make_private_bio(req, req->master_bio); 1416 drbd_req_make_private_bio(req, req->master_bio);
1416 req->private_bio->bi_bdev = mdev->ldev->backing_bdev; 1417 req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
@@ -1425,7 +1426,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
1425 int resync_after; 1426 int resync_after;
1426 1427
1427 while (1) { 1428 while (1) {
1428 if (!odev->ldev) 1429 if (!odev->ldev || odev->state.disk == D_DISKLESS)
1429 return 1; 1430 return 1;
1430 rcu_read_lock(); 1431 rcu_read_lock();
1431 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; 1432 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
@@ -1433,7 +1434,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
1433 if (resync_after == -1) 1434 if (resync_after == -1)
1434 return 1; 1435 return 1;
1435 odev = minor_to_mdev(resync_after); 1436 odev = minor_to_mdev(resync_after);
1436 if (!expect(odev)) 1437 if (!odev)
1437 return 1; 1438 return 1;
1438 if ((odev->state.conn >= C_SYNC_SOURCE && 1439 if ((odev->state.conn >= C_SYNC_SOURCE &&
1439 odev->state.conn <= C_PAUSED_SYNC_T) || 1440 odev->state.conn <= C_PAUSED_SYNC_T) ||
@@ -1515,7 +1516,7 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
1515 1516
1516 if (o_minor == -1) 1517 if (o_minor == -1)
1517 return NO_ERROR; 1518 return NO_ERROR;
1518 if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) 1519 if (o_minor < -1 || o_minor > MINORMASK)
1519 return ERR_RESYNC_AFTER; 1520 return ERR_RESYNC_AFTER;
1520 1521
1521 /* check for loops */ 1522 /* check for loops */
@@ -1524,6 +1525,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
1524 if (odev == mdev) 1525 if (odev == mdev)
1525 return ERR_RESYNC_AFTER_CYCLE; 1526 return ERR_RESYNC_AFTER_CYCLE;
1526 1527
1528 /* You are free to depend on diskless, non-existing,
1529 * or not yet/no longer existing minors.
1530 * We only reject dependency loops.
1531 * We cannot follow the dependency chain beyond a detached or
1532 * missing minor.
1533 */
1534 if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
1535 return NO_ERROR;
1536
1527 rcu_read_lock(); 1537 rcu_read_lock();
1528 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; 1538 resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
1529 rcu_read_unlock(); 1539 rcu_read_unlock();
@@ -1652,7 +1662,9 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1652 clear_bit(B_RS_H_DONE, &mdev->flags); 1662 clear_bit(B_RS_H_DONE, &mdev->flags);
1653 1663
1654 write_lock_irq(&global_state_lock); 1664 write_lock_irq(&global_state_lock);
1655 if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { 1665 /* Did some connection breakage or IO error race with us? */
1666 if (mdev->state.conn < C_CONNECTED
1667 || !get_ldev_if_state(mdev, D_NEGOTIATING)) {
1656 write_unlock_irq(&global_state_lock); 1668 write_unlock_irq(&global_state_lock);
1657 mutex_unlock(mdev->state_mutex); 1669 mutex_unlock(mdev->state_mutex);
1658 return; 1670 return;
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 076ae7f1b781..a56cfcd5d648 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -780,6 +780,7 @@ static const struct block_device_operations mg_disk_ops = {
780 .getgeo = mg_getgeo 780 .getgeo = mg_getgeo
781}; 781};
782 782
783#ifdef CONFIG_PM_SLEEP
783static int mg_suspend(struct device *dev) 784static int mg_suspend(struct device *dev)
784{ 785{
785 struct mg_drv_data *prv_data = dev->platform_data; 786 struct mg_drv_data *prv_data = dev->platform_data;
@@ -824,6 +825,7 @@ static int mg_resume(struct device *dev)
824 825
825 return 0; 826 return 0;
826} 827}
828#endif
827 829
828static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume); 830static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
829 831
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 32c678028e53..847107ef0cce 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -728,7 +728,10 @@ static void mtip_async_complete(struct mtip_port *port,
728 atomic_set(&port->commands[tag].active, 0); 728 atomic_set(&port->commands[tag].active, 0);
729 release_slot(port, tag); 729 release_slot(port, tag);
730 730
731 up(&port->cmd_slot); 731 if (unlikely(command->unaligned))
732 up(&port->cmd_slot_unal);
733 else
734 up(&port->cmd_slot);
732} 735}
733 736
734/* 737/*
@@ -1560,10 +1563,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
1560 } 1563 }
1561#endif 1564#endif
1562 1565
1566#ifdef MTIP_TRIM /* Disabling TRIM support temporarily */
1563 /* Demux ID.DRAT & ID.RZAT to determine trim support */ 1567 /* Demux ID.DRAT & ID.RZAT to determine trim support */
1564 if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) 1568 if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5))
1565 port->dd->trim_supp = true; 1569 port->dd->trim_supp = true;
1566 else 1570 else
1571#endif
1567 port->dd->trim_supp = false; 1572 port->dd->trim_supp = false;
1568 1573
1569 /* Set the identify buffer as valid. */ 1574 /* Set the identify buffer as valid. */
@@ -2557,7 +2562,7 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
2557 */ 2562 */
2558static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, 2563static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
2559 int nsect, int nents, int tag, void *callback, 2564 int nsect, int nents, int tag, void *callback,
2560 void *data, int dir) 2565 void *data, int dir, int unaligned)
2561{ 2566{
2562 struct host_to_dev_fis *fis; 2567 struct host_to_dev_fis *fis;
2563 struct mtip_port *port = dd->port; 2568 struct mtip_port *port = dd->port;
@@ -2570,6 +2575,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
2570 2575
2571 command->scatter_ents = nents; 2576 command->scatter_ents = nents;
2572 2577
2578 command->unaligned = unaligned;
2573 /* 2579 /*
2574 * The number of retries for this command before it is 2580 * The number of retries for this command before it is
2575 * reported as a failure to the upper layers. 2581 * reported as a failure to the upper layers.
@@ -2598,6 +2604,9 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
2598 fis->res3 = 0; 2604 fis->res3 = 0;
2599 fill_command_sg(dd, command, nents); 2605 fill_command_sg(dd, command, nents);
2600 2606
2607 if (unaligned)
2608 fis->device |= 1 << 7;
2609
2601 /* Populate the command header */ 2610 /* Populate the command header */
2602 command->command_header->opts = 2611 command->command_header->opts =
2603 __force_bit2int cpu_to_le32( 2612 __force_bit2int cpu_to_le32(
@@ -2644,9 +2653,13 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
2644 * return value 2653 * return value
2645 * None 2654 * None
2646 */ 2655 */
2647static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) 2656static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag,
2657 int unaligned)
2648{ 2658{
2659 struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal :
2660 &dd->port->cmd_slot;
2649 release_slot(dd->port, tag); 2661 release_slot(dd->port, tag);
2662 up(sem);
2650} 2663}
2651 2664
2652/* 2665/*
@@ -2661,22 +2674,25 @@ static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag)
2661 * or NULL if no command slots are available. 2674 * or NULL if no command slots are available.
2662 */ 2675 */
2663static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, 2676static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
2664 int *tag) 2677 int *tag, int unaligned)
2665{ 2678{
2679 struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal :
2680 &dd->port->cmd_slot;
2681
2666 /* 2682 /*
2667 * It is possible that, even with this semaphore, a thread 2683 * It is possible that, even with this semaphore, a thread
2668 * may think that no command slots are available. Therefore, we 2684 * may think that no command slots are available. Therefore, we
2669 * need to make an attempt to get_slot(). 2685 * need to make an attempt to get_slot().
2670 */ 2686 */
2671 down(&dd->port->cmd_slot); 2687 down(sem);
2672 *tag = get_slot(dd->port); 2688 *tag = get_slot(dd->port);
2673 2689
2674 if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { 2690 if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
2675 up(&dd->port->cmd_slot); 2691 up(sem);
2676 return NULL; 2692 return NULL;
2677 } 2693 }
2678 if (unlikely(*tag < 0)) { 2694 if (unlikely(*tag < 0)) {
2679 up(&dd->port->cmd_slot); 2695 up(sem);
2680 return NULL; 2696 return NULL;
2681 } 2697 }
2682 2698
@@ -3010,6 +3026,11 @@ static inline void hba_setup(struct driver_data *dd)
3010 dd->mmio + HOST_HSORG); 3026 dd->mmio + HOST_HSORG);
3011} 3027}
3012 3028
3029static int mtip_device_unaligned_constrained(struct driver_data *dd)
3030{
3031 return (dd->pdev->device == P420M_DEVICE_ID ? 1 : 0);
3032}
3033
3013/* 3034/*
3014 * Detect the details of the product, and store anything needed 3035 * Detect the details of the product, and store anything needed
3015 * into the driver data structure. This includes product type and 3036 * into the driver data structure. This includes product type and
@@ -3232,8 +3253,15 @@ static int mtip_hw_init(struct driver_data *dd)
3232 for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) 3253 for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
3233 dd->work[i].port = dd->port; 3254 dd->work[i].port = dd->port;
3234 3255
3256 /* Enable unaligned IO constraints for some devices */
3257 if (mtip_device_unaligned_constrained(dd))
3258 dd->unal_qdepth = MTIP_MAX_UNALIGNED_SLOTS;
3259 else
3260 dd->unal_qdepth = 0;
3261
3235 /* Counting semaphore to track command slot usage */ 3262 /* Counting semaphore to track command slot usage */
3236 sema_init(&dd->port->cmd_slot, num_command_slots - 1); 3263 sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth);
3264 sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth);
3237 3265
3238 /* Spinlock to prevent concurrent issue */ 3266 /* Spinlock to prevent concurrent issue */
3239 for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) 3267 for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
@@ -3836,7 +3864,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
3836 struct scatterlist *sg; 3864 struct scatterlist *sg;
3837 struct bio_vec *bvec; 3865 struct bio_vec *bvec;
3838 int nents = 0; 3866 int nents = 0;
3839 int tag = 0; 3867 int tag = 0, unaligned = 0;
3840 3868
3841 if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { 3869 if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
3842 if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, 3870 if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
@@ -3872,7 +3900,15 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
3872 return; 3900 return;
3873 } 3901 }
3874 3902
3875 sg = mtip_hw_get_scatterlist(dd, &tag); 3903 if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 &&
3904 dd->unal_qdepth) {
3905 if (bio->bi_sector % 8 != 0) /* Unaligned on 4k boundaries */
3906 unaligned = 1;
3907 else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */
3908 unaligned = 1;
3909 }
3910
3911 sg = mtip_hw_get_scatterlist(dd, &tag, unaligned);
3876 if (likely(sg != NULL)) { 3912 if (likely(sg != NULL)) {
3877 blk_queue_bounce(queue, &bio); 3913 blk_queue_bounce(queue, &bio);
3878 3914
@@ -3880,7 +3916,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
3880 dev_warn(&dd->pdev->dev, 3916 dev_warn(&dd->pdev->dev,
3881 "Maximum number of SGL entries exceeded\n"); 3917 "Maximum number of SGL entries exceeded\n");
3882 bio_io_error(bio); 3918 bio_io_error(bio);
3883 mtip_hw_release_scatterlist(dd, tag); 3919 mtip_hw_release_scatterlist(dd, tag, unaligned);
3884 return; 3920 return;
3885 } 3921 }
3886 3922
@@ -3900,7 +3936,8 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
3900 tag, 3936 tag,
3901 bio_endio, 3937 bio_endio,
3902 bio, 3938 bio,
3903 bio_data_dir(bio)); 3939 bio_data_dir(bio),
3940 unaligned);
3904 } else 3941 } else
3905 bio_io_error(bio); 3942 bio_io_error(bio);
3906} 3943}
@@ -4156,26 +4193,24 @@ static int mtip_block_remove(struct driver_data *dd)
4156 */ 4193 */
4157static int mtip_block_shutdown(struct driver_data *dd) 4194static int mtip_block_shutdown(struct driver_data *dd)
4158{ 4195{
4159 dev_info(&dd->pdev->dev,
4160 "Shutting down %s ...\n", dd->disk->disk_name);
4161
4162 /* Delete our gendisk structure, and cleanup the blk queue. */ 4196 /* Delete our gendisk structure, and cleanup the blk queue. */
4163 if (dd->disk) { 4197 if (dd->disk) {
4164 if (dd->disk->queue) 4198 dev_info(&dd->pdev->dev,
4199 "Shutting down %s ...\n", dd->disk->disk_name);
4200
4201 if (dd->disk->queue) {
4165 del_gendisk(dd->disk); 4202 del_gendisk(dd->disk);
4166 else 4203 blk_cleanup_queue(dd->queue);
4204 } else
4167 put_disk(dd->disk); 4205 put_disk(dd->disk);
4206 dd->disk = NULL;
4207 dd->queue = NULL;
4168 } 4208 }
4169 4209
4170
4171 spin_lock(&rssd_index_lock); 4210 spin_lock(&rssd_index_lock);
4172 ida_remove(&rssd_index_ida, dd->index); 4211 ida_remove(&rssd_index_ida, dd->index);
4173 spin_unlock(&rssd_index_lock); 4212 spin_unlock(&rssd_index_lock);
4174 4213
4175 blk_cleanup_queue(dd->queue);
4176 dd->disk = NULL;
4177 dd->queue = NULL;
4178
4179 mtip_hw_shutdown(dd); 4214 mtip_hw_shutdown(dd);
4180 return 0; 4215 return 0;
4181} 4216}
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 8e8334c9dd0f..3bb8a295fbe4 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -52,6 +52,9 @@
52#define MTIP_FTL_REBUILD_MAGIC 0xED51 52#define MTIP_FTL_REBUILD_MAGIC 0xED51
53#define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000 53#define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000
54 54
55/* unaligned IO handling */
56#define MTIP_MAX_UNALIGNED_SLOTS 8
57
55/* Macro to extract the tag bit number from a tag value. */ 58/* Macro to extract the tag bit number from a tag value. */
56#define MTIP_TAG_BIT(tag) (tag & 0x1F) 59#define MTIP_TAG_BIT(tag) (tag & 0x1F)
57 60
@@ -333,6 +336,8 @@ struct mtip_cmd {
333 336
334 int scatter_ents; /* Number of scatter list entries used */ 337 int scatter_ents; /* Number of scatter list entries used */
335 338
339 int unaligned; /* command is unaligned on 4k boundary */
340
336 struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ 341 struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
337 342
338 int retries; /* The number of retries left for this command. */ 343 int retries; /* The number of retries left for this command. */
@@ -452,6 +457,10 @@ struct mtip_port {
452 * command slots available. 457 * command slots available.
453 */ 458 */
454 struct semaphore cmd_slot; 459 struct semaphore cmd_slot;
460
461 /* Semaphore to control queue depth of unaligned IOs */
462 struct semaphore cmd_slot_unal;
463
455 /* Spinlock for working around command-issue bug. */ 464 /* Spinlock for working around command-issue bug. */
456 spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; 465 spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];
457}; 466};
@@ -502,6 +511,8 @@ struct driver_data {
502 511
503 int isr_binding; 512 int isr_binding;
504 513
514 int unal_qdepth; /* qdepth of unaligned IO queue */
515
505 struct list_head online_list; /* linkage for online list */ 516 struct list_head online_list; /* linkage for online list */
506 517
507 struct list_head remove_list; /* linkage for removing list */ 518 struct list_head remove_list; /* linkage for removing list */
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 4d8d90b4fe78..3bfc8f1da9fe 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -174,6 +174,8 @@ config MD_FAULTY
174 174
175 In unsure, say N. 175 In unsure, say N.
176 176
177source "drivers/md/bcache/Kconfig"
178
177config BLK_DEV_DM 179config BLK_DEV_DM
178 tristate "Device mapper support" 180 tristate "Device mapper support"
179 ---help--- 181 ---help---
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 7ceeaefc0e95..1439fd4ad9b1 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o
29obj-$(CONFIG_MD_RAID456) += raid456.o 29obj-$(CONFIG_MD_RAID456) += raid456.o
30obj-$(CONFIG_MD_MULTIPATH) += multipath.o 30obj-$(CONFIG_MD_MULTIPATH) += multipath.o
31obj-$(CONFIG_MD_FAULTY) += faulty.o 31obj-$(CONFIG_MD_FAULTY) += faulty.o
32obj-$(CONFIG_BCACHE) += bcache/
32obj-$(CONFIG_BLK_DEV_MD) += md-mod.o 33obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
33obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o 34obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
34obj-$(CONFIG_DM_BUFIO) += dm-bufio.o 35obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
new file mode 100644
index 000000000000..05c220d05e23
--- /dev/null
+++ b/drivers/md/bcache/Kconfig
@@ -0,0 +1,42 @@
1
2config BCACHE
3 tristate "Block device as cache"
4 select CLOSURES
5 ---help---
6 Allows a block device to be used as cache for other devices; uses
7 a btree for indexing and the layout is optimized for SSDs.
8
9 See Documentation/bcache.txt for details.
10
11config BCACHE_DEBUG
12 bool "Bcache debugging"
13 depends on BCACHE
14 ---help---
15 Don't select this option unless you're a developer
16
17 Enables extra debugging tools (primarily a fuzz tester)
18
19config BCACHE_EDEBUG
20 bool "Extended runtime checks"
21 depends on BCACHE
22 ---help---
23 Don't select this option unless you're a developer
24
25 Enables extra runtime checks which significantly affect performance
26
27config BCACHE_CLOSURES_DEBUG
28 bool "Debug closures"
29 depends on BCACHE
30 select DEBUG_FS
31 ---help---
32 Keeps all active closures in a linked list and provides a debugfs
33 interface to list them, which makes it possible to see asynchronous
34 operations that get stuck.
35
36# cgroup code needs to be updated:
37#
38#config CGROUP_BCACHE
39# bool "Cgroup controls for bcache"
40# depends on BCACHE && BLK_CGROUP
41# ---help---
42# TODO
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
new file mode 100644
index 000000000000..0e9c82523be6
--- /dev/null
+++ b/drivers/md/bcache/Makefile
@@ -0,0 +1,7 @@
1
2obj-$(CONFIG_BCACHE) += bcache.o
3
4bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\
5 movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o
6
7CFLAGS_request.o += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
new file mode 100644
index 000000000000..048f2947e08b
--- /dev/null
+++ b/drivers/md/bcache/alloc.c
@@ -0,0 +1,599 @@
1/*
2 * Primary bucket allocation code
3 *
4 * Copyright 2012 Google, Inc.
5 *
6 * Allocation in bcache is done in terms of buckets:
7 *
8 * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
9 * btree pointers - they must match for the pointer to be considered valid.
10 *
11 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
12 * bucket simply by incrementing its gen.
13 *
14 * The gens (along with the priorities; it's really the gens that are important but
15 * the code is named as if it's the priorities) are written in an arbitrary list
16 * of buckets on disk, with a pointer to them in the journal header.
17 *
18 * When we invalidate a bucket, we have to write its new gen to disk and wait
19 * for that write to complete before we use it - otherwise after a crash we
20 * could have pointers that appeared to be good but pointed to data that had
21 * been overwritten.
22 *
23 * Since the gens and priorities are all stored contiguously on disk, we can
24 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
25 * call prio_write(), and when prio_write() finishes we pull buckets off the
26 * free_inc list and optionally discard them.
27 *
28 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
29 * priorities and gens were being written before we could allocate. c->free is a
30 * smaller freelist, and buckets on that list are always ready to be used.
31 *
32 * If we've got discards enabled, that happens when a bucket moves from the
33 * free_inc list to the free list.
34 *
35 * There is another freelist, because sometimes we have buckets that we know
36 * have nothing pointing into them - these we can reuse without waiting for
37 * priorities to be rewritten. These come from freed btree nodes and buckets
38 * that garbage collection discovered no longer had valid keys pointing into
39 * them (because they were overwritten). That's the unused list - buckets on the
40 * unused list move to the free list, optionally being discarded in the process.
41 *
42 * It's also important to ensure that gens don't wrap around - with respect to
43 * either the oldest gen in the btree or the gen on disk. This is quite
44 * difficult to do in practice, but we explicitly guard against it anyways - if
45 * a bucket is in danger of wrapping around we simply skip invalidating it that
46 * time around, and we garbage collect or rewrite the priorities sooner than we
47 * would have otherwise.
48 *
49 * bch_bucket_alloc() allocates a single bucket from a specific cache.
50 *
51 * bch_bucket_alloc_set() allocates one or more buckets from different caches
52 * out of a cache set.
53 *
54 * bch_allocator_thread() drives all the processes described above. It's woken
55 * from bch_bucket_alloc() and a few other places that need to make sure free
56 * buckets are ready.
57 *
58 * invalidate_buckets_(lru|fifo|random)() find buckets that are available to be
59 * invalidated, and then invalidate them and stick them on the free_inc list -
60 * in lru, fifo or random order.
61 */
62
63#include "bcache.h"
64#include "btree.h"
65
66#include <linux/random.h>
67
68#define MAX_IN_FLIGHT_DISCARDS 8U
69
70/* Bucket heap / gen */
71
72uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
73{
74 uint8_t ret = ++b->gen;
75
76 ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
77 WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
78
79 if (CACHE_SYNC(&ca->set->sb)) {
80 ca->need_save_prio = max(ca->need_save_prio,
81 bucket_disk_gen(b));
82 WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
83 }
84
85 return ret;
86}
87
88void bch_rescale_priorities(struct cache_set *c, int sectors)
89{
90 struct cache *ca;
91 struct bucket *b;
92 unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
93 unsigned i;
94 int r;
95
96 atomic_sub(sectors, &c->rescale);
97
98 do {
99 r = atomic_read(&c->rescale);
100
101 if (r >= 0)
102 return;
103 } while (atomic_cmpxchg(&c->rescale, r, r + next) != r);
104
105 mutex_lock(&c->bucket_lock);
106
107 c->min_prio = USHRT_MAX;
108
109 for_each_cache(ca, c, i)
110 for_each_bucket(b, ca)
111 if (b->prio &&
112 b->prio != BTREE_PRIO &&
113 !atomic_read(&b->pin)) {
114 b->prio--;
115 c->min_prio = min(c->min_prio, b->prio);
116 }
117
118 mutex_unlock(&c->bucket_lock);
119}
120
121/* Discard/TRIM */
122
123struct discard {
124 struct list_head list;
125 struct work_struct work;
126 struct cache *ca;
127 long bucket;
128
129 struct bio bio;
130 struct bio_vec bv;
131};
132
133static void discard_finish(struct work_struct *w)
134{
135 struct discard *d = container_of(w, struct discard, work);
136 struct cache *ca = d->ca;
137 char buf[BDEVNAME_SIZE];
138
139 if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
140 pr_notice("discard error on %s, disabling",
141 bdevname(ca->bdev, buf));
142 d->ca->discard = 0;
143 }
144
145 mutex_lock(&ca->set->bucket_lock);
146
147 fifo_push(&ca->free, d->bucket);
148 list_add(&d->list, &ca->discards);
149 atomic_dec(&ca->discards_in_flight);
150
151 mutex_unlock(&ca->set->bucket_lock);
152
153 closure_wake_up(&ca->set->bucket_wait);
154 wake_up(&ca->set->alloc_wait);
155
156 closure_put(&ca->set->cl);
157}
158
159static void discard_endio(struct bio *bio, int error)
160{
161 struct discard *d = container_of(bio, struct discard, bio);
162 schedule_work(&d->work);
163}
164
165static void do_discard(struct cache *ca, long bucket)
166{
167 struct discard *d = list_first_entry(&ca->discards,
168 struct discard, list);
169
170 list_del(&d->list);
171 d->bucket = bucket;
172
173 atomic_inc(&ca->discards_in_flight);
174 closure_get(&ca->set->cl);
175
176 bio_init(&d->bio);
177
178 d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket);
179 d->bio.bi_bdev = ca->bdev;
180 d->bio.bi_rw = REQ_WRITE|REQ_DISCARD;
181 d->bio.bi_max_vecs = 1;
182 d->bio.bi_io_vec = d->bio.bi_inline_vecs;
183 d->bio.bi_size = bucket_bytes(ca);
184 d->bio.bi_end_io = discard_endio;
185 bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
186
187 submit_bio(0, &d->bio);
188}
189
190/* Allocation */
191
192static inline bool can_inc_bucket_gen(struct bucket *b)
193{
194 return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
195 bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
196}
197
198bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
199{
200 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
201
202 if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
203 CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
204 return false;
205
206 b->prio = 0;
207
208 if (can_inc_bucket_gen(b) &&
209 fifo_push(&ca->unused, b - ca->buckets)) {
210 atomic_inc(&b->pin);
211 return true;
212 }
213
214 return false;
215}
216
217static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
218{
219 return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
220 !atomic_read(&b->pin) &&
221 can_inc_bucket_gen(b);
222}
223
224static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
225{
226 bch_inc_gen(ca, b);
227 b->prio = INITIAL_PRIO;
228 atomic_inc(&b->pin);
229 fifo_push(&ca->free_inc, b - ca->buckets);
230}
231
232#define bucket_prio(b) \
233 (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))
234
235#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
236#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
237
238static void invalidate_buckets_lru(struct cache *ca)
239{
240 struct bucket *b;
241 ssize_t i;
242
243 ca->heap.used = 0;
244
245 for_each_bucket(b, ca) {
246 /*
247 * If we fill up the unused list, if we then return before
248 * adding anything to the free_inc list we'll skip writing
249 * prios/gens and just go back to allocating from the unused
250 * list:
251 */
252 if (fifo_full(&ca->unused))
253 return;
254
255 if (!can_invalidate_bucket(ca, b))
256 continue;
257
258 if (!GC_SECTORS_USED(b) &&
259 bch_bucket_add_unused(ca, b))
260 continue;
261
262 if (!heap_full(&ca->heap))
263 heap_add(&ca->heap, b, bucket_max_cmp);
264 else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
265 ca->heap.data[0] = b;
266 heap_sift(&ca->heap, 0, bucket_max_cmp);
267 }
268 }
269
270 for (i = ca->heap.used / 2 - 1; i >= 0; --i)
271 heap_sift(&ca->heap, i, bucket_min_cmp);
272
273 while (!fifo_full(&ca->free_inc)) {
274 if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
275 /*
276 * We don't want to be calling invalidate_buckets()
277 * multiple times when it can't do anything
278 */
279 ca->invalidate_needs_gc = 1;
280 bch_queue_gc(ca->set);
281 return;
282 }
283
284 invalidate_one_bucket(ca, b);
285 }
286}
287
288static void invalidate_buckets_fifo(struct cache *ca)
289{
290 struct bucket *b;
291 size_t checked = 0;
292
293 while (!fifo_full(&ca->free_inc)) {
294 if (ca->fifo_last_bucket < ca->sb.first_bucket ||
295 ca->fifo_last_bucket >= ca->sb.nbuckets)
296 ca->fifo_last_bucket = ca->sb.first_bucket;
297
298 b = ca->buckets + ca->fifo_last_bucket++;
299
300 if (can_invalidate_bucket(ca, b))
301 invalidate_one_bucket(ca, b);
302
303 if (++checked >= ca->sb.nbuckets) {
304 ca->invalidate_needs_gc = 1;
305 bch_queue_gc(ca->set);
306 return;
307 }
308 }
309}
310
311static void invalidate_buckets_random(struct cache *ca)
312{
313 struct bucket *b;
314 size_t checked = 0;
315
316 while (!fifo_full(&ca->free_inc)) {
317 size_t n;
318 get_random_bytes(&n, sizeof(n));
319
320 n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
321 n += ca->sb.first_bucket;
322
323 b = ca->buckets + n;
324
325 if (can_invalidate_bucket(ca, b))
326 invalidate_one_bucket(ca, b);
327
328 if (++checked >= ca->sb.nbuckets / 2) {
329 ca->invalidate_needs_gc = 1;
330 bch_queue_gc(ca->set);
331 return;
332 }
333 }
334}
335
336static void invalidate_buckets(struct cache *ca)
337{
338 if (ca->invalidate_needs_gc)
339 return;
340
341 switch (CACHE_REPLACEMENT(&ca->sb)) {
342 case CACHE_REPLACEMENT_LRU:
343 invalidate_buckets_lru(ca);
344 break;
345 case CACHE_REPLACEMENT_FIFO:
346 invalidate_buckets_fifo(ca);
347 break;
348 case CACHE_REPLACEMENT_RANDOM:
349 invalidate_buckets_random(ca);
350 break;
351 }
352
353 pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
354 fifo_used(&ca->free), ca->free.size,
355 fifo_used(&ca->free_inc), ca->free_inc.size,
356 fifo_used(&ca->unused), ca->unused.size);
357}
358
359#define allocator_wait(ca, cond) \
360do { \
361 DEFINE_WAIT(__wait); \
362 \
363 while (1) { \
364 prepare_to_wait(&ca->set->alloc_wait, \
365 &__wait, TASK_INTERRUPTIBLE); \
366 if (cond) \
367 break; \
368 \
369 mutex_unlock(&(ca)->set->bucket_lock); \
370 if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
371 finish_wait(&ca->set->alloc_wait, &__wait); \
372 closure_return(cl); \
373 } \
374 \
375 schedule(); \
376 mutex_lock(&(ca)->set->bucket_lock); \
377 } \
378 \
379 finish_wait(&ca->set->alloc_wait, &__wait); \
380} while (0)
381
382void bch_allocator_thread(struct closure *cl)
383{
384 struct cache *ca = container_of(cl, struct cache, alloc);
385
386 mutex_lock(&ca->set->bucket_lock);
387
388 while (1) {
389 /*
390 * First, we pull buckets off of the unused and free_inc lists,
391 * possibly issue discards to them, then we add the bucket to
392 * the free list:
393 */
394 while (1) {
395 long bucket;
396
397 if ((!atomic_read(&ca->set->prio_blocked) ||
398 !CACHE_SYNC(&ca->set->sb)) &&
399 !fifo_empty(&ca->unused))
400 fifo_pop(&ca->unused, bucket);
401 else if (!fifo_empty(&ca->free_inc))
402 fifo_pop(&ca->free_inc, bucket);
403 else
404 break;
405
406 allocator_wait(ca, (int) fifo_free(&ca->free) >
407 atomic_read(&ca->discards_in_flight));
408
409 if (ca->discard) {
410 allocator_wait(ca, !list_empty(&ca->discards));
411 do_discard(ca, bucket);
412 } else {
413 fifo_push(&ca->free, bucket);
414 closure_wake_up(&ca->set->bucket_wait);
415 }
416 }
417
418 /*
419 * We've run out of free buckets, we need to find some buckets
420 * we can invalidate. First, invalidate them in memory and add
421 * them to the free_inc list:
422 */
423
424 allocator_wait(ca, ca->set->gc_mark_valid &&
425 (ca->need_save_prio > 64 ||
426 !ca->invalidate_needs_gc));
427 invalidate_buckets(ca);
428
429 /*
430 * Now, we write their new gens to disk so we can start writing
431 * new stuff to them:
432 */
433 allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
434 if (CACHE_SYNC(&ca->set->sb) &&
435 (!fifo_empty(&ca->free_inc) ||
436 ca->need_save_prio > 64))
437 bch_prio_write(ca);
438 }
439}
440
441long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
442{
443 long r = -1;
444again:
445 wake_up(&ca->set->alloc_wait);
446
447 if (fifo_used(&ca->free) > ca->watermark[watermark] &&
448 fifo_pop(&ca->free, r)) {
449 struct bucket *b = ca->buckets + r;
450#ifdef CONFIG_BCACHE_EDEBUG
451 size_t iter;
452 long i;
453
454 for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
455 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
456
457 fifo_for_each(i, &ca->free, iter)
458 BUG_ON(i == r);
459 fifo_for_each(i, &ca->free_inc, iter)
460 BUG_ON(i == r);
461 fifo_for_each(i, &ca->unused, iter)
462 BUG_ON(i == r);
463#endif
464 BUG_ON(atomic_read(&b->pin) != 1);
465
466 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
467
468 if (watermark <= WATERMARK_METADATA) {
469 SET_GC_MARK(b, GC_MARK_METADATA);
470 b->prio = BTREE_PRIO;
471 } else {
472 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
473 b->prio = INITIAL_PRIO;
474 }
475
476 return r;
477 }
478
479 pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
480 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
481 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
482
483 if (cl) {
484 closure_wait(&ca->set->bucket_wait, cl);
485
486 if (closure_blocking(cl)) {
487 mutex_unlock(&ca->set->bucket_lock);
488 closure_sync(cl);
489 mutex_lock(&ca->set->bucket_lock);
490 goto again;
491 }
492 }
493
494 return -1;
495}
496
497void bch_bucket_free(struct cache_set *c, struct bkey *k)
498{
499 unsigned i;
500
501 for (i = 0; i < KEY_PTRS(k); i++) {
502 struct bucket *b = PTR_BUCKET(c, k, i);
503
504 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
505 SET_GC_SECTORS_USED(b, 0);
506 bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
507 }
508}
509
510int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
511 struct bkey *k, int n, struct closure *cl)
512{
513 int i;
514
515 lockdep_assert_held(&c->bucket_lock);
516 BUG_ON(!n || n > c->caches_loaded || n > 8);
517
518 bkey_init(k);
519
520 /* sort by free space/prio of oldest data in caches */
521
522 for (i = 0; i < n; i++) {
523 struct cache *ca = c->cache_by_alloc[i];
524 long b = bch_bucket_alloc(ca, watermark, cl);
525
526 if (b == -1)
527 goto err;
528
529 k->ptr[i] = PTR(ca->buckets[b].gen,
530 bucket_to_sector(c, b),
531 ca->sb.nr_this_dev);
532
533 SET_KEY_PTRS(k, i + 1);
534 }
535
536 return 0;
537err:
538 bch_bucket_free(c, k);
539 __bkey_put(c, k);
540 return -1;
541}
542
543int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
544 struct bkey *k, int n, struct closure *cl)
545{
546 int ret;
547 mutex_lock(&c->bucket_lock);
548 ret = __bch_bucket_alloc_set(c, watermark, k, n, cl);
549 mutex_unlock(&c->bucket_lock);
550 return ret;
551}
552
553/* Init */
554
555void bch_cache_allocator_exit(struct cache *ca)
556{
557 struct discard *d;
558
559 while (!list_empty(&ca->discards)) {
560 d = list_first_entry(&ca->discards, struct discard, list);
561 cancel_work_sync(&d->work);
562 list_del(&d->list);
563 kfree(d);
564 }
565}
566
567int bch_cache_allocator_init(struct cache *ca)
568{
569 unsigned i;
570
571 /*
572 * Reserve:
573 * Prio/gen writes first
574 * Then 8 for btree allocations
575 * Then half for the moving garbage collector
576 */
577
578 ca->watermark[WATERMARK_PRIO] = 0;
579
580 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
581
582 ca->watermark[WATERMARK_MOVINGGC] = 8 +
583 ca->watermark[WATERMARK_METADATA];
584
585 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
586 ca->watermark[WATERMARK_MOVINGGC];
587
588 for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
589 struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
590 if (!d)
591 return -ENOMEM;
592
593 d->ca = ca;
594 INIT_WORK(&d->work, discard_finish);
595 list_add(&d->list, &ca->discards);
596 }
597
598 return 0;
599}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
new file mode 100644
index 000000000000..340146d7c17f
--- /dev/null
+++ b/drivers/md/bcache/bcache.h
@@ -0,0 +1,1259 @@
1#ifndef _BCACHE_H
2#define _BCACHE_H
3
4/*
5 * SOME HIGH LEVEL CODE DOCUMENTATION:
6 *
7 * Bcache mostly works with cache sets, cache devices, and backing devices.
8 *
9 * Support for multiple cache devices hasn't quite been finished off yet, but
10 * it's about 95% plumbed through. A cache set and its cache devices is sort of
11 * like a md raid array and its component devices. Most of the code doesn't care
12 * about individual cache devices, the main abstraction is the cache set.
13 *
14 * Multiple cache devices is intended to give us the ability to mirror dirty
15 * cached data and metadata, without mirroring clean cached data.
16 *
17 * Backing devices are different, in that they have a lifetime independent of a
18 * cache set. When you register a newly formatted backing device it'll come up
19 * in passthrough mode, and then you can attach and detach a backing device from
20 * a cache set at runtime - while it's mounted and in use. Detaching implicitly
21 * invalidates any cached data for that backing device.
22 *
23 * A cache set can have multiple (many) backing devices attached to it.
24 *
25 * There's also flash only volumes - this is the reason for the distinction
26 * between struct cached_dev and struct bcache_device. A flash only volume
27 * works much like a bcache device that has a backing device, except the
28 * "cached" data is always dirty. The end result is that we get thin
29 * provisioning with very little additional code.
30 *
31 * Flash only volumes work but they're not production ready because the moving
32 * garbage collector needs more work. More on that later.
33 *
34 * BUCKETS/ALLOCATION:
35 *
36 * Bcache is primarily designed for caching, which means that in normal
37 * operation all of our available space will be allocated. Thus, we need an
38 * efficient way of deleting things from the cache so we can write new things to
39 * it.
40 *
41 * To do this, we first divide the cache device up into buckets. A bucket is the
42 * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
43 * works efficiently.
44 *
45 * Each bucket has a 16 bit priority, and an 8 bit generation associated with
46 * it. The gens and priorities for all the buckets are stored contiguously and
47 * packed on disk (in a linked list of buckets - aside from the superblock, all
48 * of bcache's metadata is stored in buckets).
49 *
50 * The priority is used to implement an LRU. We reset a bucket's priority when
51 * we allocate it or on a cache hit, and every so often we decrement the priority
52 * of each bucket. It could be used to implement something more sophisticated,
53 * if anyone ever gets around to it.
54 *
55 * The generation is used for invalidating buckets. Each pointer also has an 8
56 * bit generation embedded in it; for a pointer to be considered valid, its gen
57 * must match the gen of the bucket it points into. Thus, to reuse a bucket all
58 * we have to do is increment its gen (and write its new gen to disk; we batch
59 * this up).
60 *
61 * Bcache is entirely COW - we never write twice to a bucket, even buckets that
62 * contain metadata (including btree nodes).
63 *
64 * THE BTREE:
65 *
66 * Bcache is in large part designed around the btree.
67 *
68 * At a high level, the btree is just an index of key -> ptr tuples.
69 *
70 * Keys represent extents, and thus have a size field. Keys also have a variable
71 * number of pointers attached to them (potentially zero, which is handy for
72 * invalidating the cache).
73 *
74 * The key itself is an inode:offset pair. The inode number corresponds to a
75 * backing device or a flash only volume. The offset is the ending offset of the
76 * extent within the inode - not the starting offset; this makes lookups
77 * slightly more convenient.
78 *
79 * Pointers contain the cache device id, the offset on that device, and an 8 bit
80 * generation number. More on the gen later.
81 *
82 * Index lookups are not fully abstracted - cache lookups in particular are
83 * still somewhat mixed in with the btree code, but things are headed in that
84 * direction.
85 *
86 * Updates are fairly well abstracted, though. There are two different ways of
87 * updating the btree; insert and replace.
88 *
89 * BTREE_INSERT will just take a list of keys and insert them into the btree -
90 * overwriting (possibly only partially) any extents they overlap with. This is
91 * used to update the index after a write.
92 *
93 * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
94 * overwriting a key that matches another given key. This is used for inserting
95 * data into the cache after a cache miss, and for background writeback, and for
96 * the moving garbage collector.
97 *
98 * There is no "delete" operation; deleting things from the index is
99 * accomplished either by invalidating pointers (by incrementing a bucket's
100 * gen) or by inserting a key with 0 pointers - which will overwrite anything
101 * previously present at that location in the index.
102 *
103 * This means that there are always stale/invalid keys in the btree. They're
104 * filtered out by the code that iterates through a btree node, and removed when
105 * a btree node is rewritten.
106 *
107 * BTREE NODES:
108 *
109 * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
110 * free smaller than a bucket - so, that's how big our btree nodes are.
111 *
112 * (If buckets are really big we'll only use part of the bucket for a btree node
113 * - no less than 1/4th - but a bucket still contains no more than a single
114 * btree node. I'd actually like to change this, but for now we rely on the
115 * bucket's gen for deleting btree nodes when we rewrite/split a node.)
116 *
117 * Anyways, btree nodes are big - big enough to be inefficient with a textbook
118 * btree implementation.
119 *
120 * The way this is solved is that btree nodes are internally log structured; we
121 * can append new keys to an existing btree node without rewriting it. This
122 * means each set of keys we write is sorted, but the node is not.
123 *
124 * We maintain this log structure in memory - keeping 1Mb of keys sorted would
125 * be expensive, and we have to distinguish between the keys we have written and
126 * the keys we haven't. So to do a lookup in a btree node, we have to search
127 * each sorted set. But we do merge written sets together lazily, so the cost of
128 * these extra searches is quite low (normally most of the keys in a btree node
129 * will be in one big set, and then there'll be one or two sets that are much
130 * smaller).
131 *
132 * This log structure makes bcache's btree more of a hybrid between a
133 * conventional btree and a compacting data structure, with some of the
134 * advantages of both.
135 *
136 * GARBAGE COLLECTION:
137 *
138 * We can't just invalidate any bucket - it might contain dirty data or
139 * metadata. If it once contained dirty data, other writes might overwrite it
140 * later, leaving no valid pointers into that bucket in the index.
141 *
142 * Thus, the primary purpose of garbage collection is to find buckets to reuse.
143 * It also counts how much valid data each bucket currently contains, so that
144 * allocation can reuse buckets sooner when they've been mostly overwritten.
145 *
146 * It also does some things that are really internal to the btree
147 * implementation. If a btree node contains pointers that are stale by more than
148 * some threshold, it rewrites the btree node to avoid the bucket's generation
149 * wrapping around. It also merges adjacent btree nodes if they're empty enough.
150 *
151 * THE JOURNAL:
152 *
153 * Bcache's journal is not necessary for consistency; we always strictly
154 * order metadata writes so that the btree and everything else is consistent on
155 * disk in the event of an unclean shutdown, and in fact bcache had writeback
156 * caching (with recovery from unclean shutdown) before journalling was
157 * implemented.
158 *
159 * Rather, the journal is purely a performance optimization; we can't complete a
160 * write until we've updated the index on disk, otherwise the cache would be
161 * inconsistent in the event of an unclean shutdown. This means that without the
162 * journal, on random write workloads we constantly have to update all the leaf
163 * nodes in the btree, and those writes will be mostly empty (appending at most
164 * a few keys each) - highly inefficient in terms of amount of metadata writes,
165 * and it puts more strain on the various btree resorting/compacting code.
166 *
167 * The journal is just a log of keys we've inserted; on startup we just reinsert
168 * all the keys in the open journal entries. That means that when we're updating
169 * a node in the btree, we can wait until a 4k block of keys fills up before
170 * writing them out.
171 *
172 * For simplicity, we only journal updates to leaf nodes; updates to parent
173 * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
174 * the complexity to deal with journalling them (in particular, journal replay)
175 * - updates to non leaf nodes just happen synchronously (see btree_split()).
176 */
177
/*
 * Format wrapper: prepends "bcache: <calling function>() " to the message
 * and appends a newline, so format strings passed through it carry neither.
 */
#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
179
180#include <linux/bio.h>
181#include <linux/blktrace_api.h>
182#include <linux/kobject.h>
183#include <linux/list.h>
184#include <linux/mutex.h>
185#include <linux/rbtree.h>
186#include <linux/rwsem.h>
187#include <linux/types.h>
188#include <linux/workqueue.h>
189
190#include "util.h"
191#include "closure.h"
192
/*
 * In-memory state kept for every bucket. prio and gen are the 16 bit priority
 * and 8 bit generation described in the comment at the top of this file.
 */
struct bucket {
	atomic_t	pin;
	uint16_t	prio;
	uint8_t		gen;
	/* NOTE(review): presumably the gen last written to disk - confirm */
	uint8_t		disk_gen;
	uint8_t		last_gc; /* Most out of date gen in the btree */
	uint8_t		gc_gen;
	/* Packed field; accessed through GC_MARK() and GC_SECTORS_USED() */
	uint16_t	gc_mark;
};
202
/*
 * I'd use bitfields for these, but I don't trust the compiler not to screw me
 * as multiple threads touch struct bucket without locking
 */

/* Bits 0-1 of bucket->gc_mark: one of the GC_MARK_* values below */
BITMASK(GC_MARK,	 struct bucket, gc_mark, 0, 2);
#define GC_MARK_RECLAIMABLE	0
#define GC_MARK_DIRTY		1
#define GC_MARK_METADATA	2
/* Bits 2-15 of bucket->gc_mark */
BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
213
/*
 * A btree key: two fixed header words followed by a variable number of
 * pointers.
 */
struct bkey {
	uint64_t	high;	/* packed fields, see the KEY_FIELD() accessors */
	uint64_t	low;	/* read/written by KEY_OFFSET()/SET_KEY_OFFSET() */
	uint64_t	ptr[];	/* packed pointers, see the PTR_FIELD() accessors */
};
219
/*
 * Enough for a key with 6 pointers: 8 uint64_t words, of which struct bkey's
 * two header words take 2.
 */
#define BKEY_PAD		8

/*
 * Declares an anonymous union holding a struct bkey named @key overlaid with
 * a BKEY_PAD-word array named @key_pad, which reserves storage for the key's
 * flexible ptr[] array.
 */
#define BKEY_PADDED(key)					\
	union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
225
/* Version 0: Cache device
 * Version 1: Backing device
 * Version 2: Seed pointer into btree node checksum
 * Version 3: Cache device with new UUID format
 * Version 4: Backing device with data offset
 *
 * Version 2 has no named constant below.
 */
#define BCACHE_SB_VERSION_CDEV			0
#define BCACHE_SB_VERSION_BDEV			1
#define BCACHE_SB_VERSION_CDEV_WITH_UUID	3
#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET	4
#define BCACHE_SB_MAX_VERSION			4

#define SB_SECTOR		8
#define SB_SIZE			4096
#define SB_LABEL_SIZE		32	/* size of cache_sb.label[] */
#define SB_JOURNAL_BUCKETS	256U	/* size of cache_sb.d[] */
/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
#define MAX_CACHES_PER_SET	8

#define BDEV_DATA_START_DEFAULT	16	/* sectors */
246
/*
 * Superblock. The same structure is used for cache devices and backing
 * devices; SB_IS_BDEV() tells them apart by @version. The checksum is the
 * first 8 bytes, as for the other on disk structures (see csum_set()).
 */
struct cache_sb {
	uint64_t		csum;
	uint64_t		offset;	/* sector where this sb was written */
	uint64_t		version;

	uint8_t			magic[16];

	uint8_t			uuid[16];
	/* set_magic aliases the first 8 bytes of set_uuid */
	union {
		uint8_t		set_uuid[16];
		uint64_t	set_magic;
	};
	uint8_t			label[SB_LABEL_SIZE];

	/* Packed; see the CACHE_* and BDEV_* BITMASK() accessors */
	uint64_t		flags;
	uint64_t		seq;
	uint64_t		pad[8];

	union {
	struct {
		/* Cache devices */
		uint64_t	nbuckets;	/* device size */

		uint16_t	block_size;	/* sectors */
		uint16_t	bucket_size;	/* sectors */

		uint16_t	nr_in_set;
		uint16_t	nr_this_dev;
	};
	struct {
		/* Backing devices */
		uint64_t	data_offset;

		/*
		 * block_size from the cache device section is still used by
		 * backing devices, so don't add anything here until we fix
		 * things to not need it for backing devices anymore
		 */
	};
	};

	uint32_t		last_mount;	/* time_t */

	uint16_t		first_bucket;
	/* Two names for the same 16 bit field */
	union {
		uint16_t	njournal_buckets;
		uint16_t	keys;
	};
	uint64_t		d[SB_JOURNAL_BUCKETS];	/* journal buckets */
};
297
/*
 * Fields packed into cache_sb.flags. The CACHE_* fields and BDEV_CACHE_MODE
 * both start at bit 0 of the same word.
 * NOTE(review): presumably CACHE_* applies to cache device superblocks and
 * BDEV_* to backing device superblocks - confirm against the users.
 */
BITMASK(CACHE_SYNC,		struct cache_sb, flags, 0, 1);
BITMASK(CACHE_DISCARD,		struct cache_sb, flags, 1, 1);
BITMASK(CACHE_REPLACEMENT,	struct cache_sb, flags, 2, 3);
#define CACHE_REPLACEMENT_LRU		0U
#define CACHE_REPLACEMENT_FIFO		1U
#define CACHE_REPLACEMENT_RANDOM	2U

BITMASK(BDEV_CACHE_MODE,	struct cache_sb, flags, 0, 4);
#define CACHE_MODE_WRITETHROUGH		0U
#define CACHE_MODE_WRITEBACK		1U
#define CACHE_MODE_WRITEAROUND		2U
#define CACHE_MODE_NONE			3U
/* Bits 61-62 */
BITMASK(BDEV_STATE,		struct cache_sb, flags, 61, 2);
#define BDEV_STATE_NONE			0U
#define BDEV_STATE_CLEAN		1U
#define BDEV_STATE_DIRTY		2U
#define BDEV_STATE_STALE		3U
315
/* Version 1: Seed pointer into btree node checksum
 */
#define BCACHE_BSET_VERSION		1

/*
 * This is the on disk format for btree nodes - a btree node on disk is a list
 * of these; within each set the keys are sorted
 */
struct bset {
	uint64_t		csum;
	uint64_t		magic;
	uint64_t		seq;
	uint32_t		version;
	/*
	 * Length of the key data in uint64_t words, not in keys: see
	 * set_bytes() and end(), which both scale/index by uint64_t.
	 */
	uint32_t		keys;

	/* Start of the key data, viewable as keys or as raw words */
	union {
		struct bkey	start[0];
		uint64_t	d[0];
	};
};
336
/*
 * On disk format for priorities and gens - see super.c near prio_write() for
 * more.
 *
 * A header followed by an array of packed (prio, gen) pairs, 3 bytes each;
 * prios_per_bucket() derives how many pairs fit in one bucket.
 */
struct prio_set {
	uint64_t		csum;
	uint64_t		magic;
	uint64_t		seq;
	uint32_t		version;
	uint32_t		pad;

	uint64_t		next_bucket;

	struct bucket_disk {
		uint16_t	prio;
		uint8_t		gen;
	} __attribute((packed)) data[];
};
355
/*
 * One UUID table entry. The union with pad[] fixes the size of an entry at
 * 128 bytes regardless of how many named fields are in use.
 */
struct uuid_entry {
	union {
		struct {
			uint8_t		uuid[16];
			uint8_t		label[32];
			uint32_t	first_reg;
			uint32_t	last_reg;
			uint32_t	invalidated;

			uint32_t	flags;
			/* Size of flash only volumes */
			uint64_t	sectors;
		};

		uint8_t	pad[128];
	};
};

/* Bit 0 of uuid_entry.flags */
BITMASK(UUID_FLASH_ONLY,	struct uuid_entry, flags, 0, 1);
375
376#include "journal.h"
377#include "stats.h"
378struct search;
379struct btree;
380struct keybuf;
381
/* One buffered key: an rb tree node plus padded storage for the key itself */
struct keybuf_key {
	struct rb_node		node;
	BKEY_PADDED(key);
	void			*private;
};
387
/* Predicate taking the keybuf and a candidate key */
typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);

/*
 * A bounded buffer of keys kept in an rb tree, with storage for up to
 * KEYBUF_NR struct keybuf_key entries declared inline.
 */
struct keybuf {
	keybuf_pred_fn		*key_predicate;

	struct bkey		last_scanned;
	spinlock_t		lock;

	/*
	 * Beginning and end of range in rb tree - so that we can skip taking
	 * lock and checking the rb tree when we need to check for overlapping
	 * keys.
	 */
	struct bkey		start;
	struct bkey		end;

	struct rb_root		keys;

#define KEYBUF_NR		100
	DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
};
409
/* Allocation pools used when splitting bios */
struct bio_split_pool {
	struct bio_set		*bio_split;
	/* NOTE(review): presumably a mempool of struct bio_split_hook - confirm */
	mempool_t		*bio_split_hook;
};

/*
 * Per-bio bookkeeping for a split.
 * NOTE(review): bi_end_io/bi_private look like the bio's original completion
 * callback and private pointer, saved for later restoration - confirm
 * against the code in io.c.
 */
struct bio_split_hook {
	struct closure		cl;
	struct bio_split_pool	*p;	/* pool this hook belongs to */
	struct bio		*bio;
	bio_end_io_t		*bi_end_io;
	void			*bi_private;
};
422
/*
 * State common to every bcache block device; struct cached_dev embeds one as
 * its "disk" member. See the note on flash only volumes at the top of this
 * file for why this is separate from struct cached_dev.
 */
struct bcache_device {
	struct closure		cl;

	struct kobject		kobj;

	struct cache_set	*c;
	unsigned		id;
#define BCACHEDEVNAME_SIZE	12
	char			name[BCACHEDEVNAME_SIZE];

	struct gendisk		*disk;

	/* If nonzero, we're closing */
	atomic_t		closing;

	/* If nonzero, we're detaching/unregistering from cache set */
	atomic_t		detaching;

	atomic_long_t		sectors_dirty;
	unsigned long		sectors_dirty_gc;
	unsigned long		sectors_dirty_last;
	long			sectors_dirty_derivative;

	mempool_t		*unaligned_bvec;
	struct bio_set		*bio_split;

	unsigned		data_csum:1;

	/* Per-device-type hooks */
	int (*cache_miss)(struct btree *, struct search *,
			  struct bio *, unsigned);
	int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);

	struct bio_split_pool	bio_split_hook;
};
457
/*
 * One tracked IO stream; struct cached_dev keeps RECENT_IO of these, linked
 * into both its io_hash[] table and its io_lru list.
 */
struct io {
	/* Used to track sequential IO so it can be skipped */
	struct hlist_node	hash;
	struct list_head	lru;

	unsigned long		jiffies;
	unsigned		sequential;
	sector_t		last;
};
467
/*
 * A backing device: the common struct bcache_device plus the backing block
 * device, its superblock, and the writeback and sequential-IO tracking
 * state.
 */
struct cached_dev {
	struct list_head	list;
	struct bcache_device	disk;
	struct block_device	*bdev;

	struct cache_sb		sb;
	struct bio		sb_bio;
	struct bio_vec		sb_bv[1];
	struct closure_with_waitlist sb_write;

	/* Refcount on the cache set. Always nonzero when we're caching. */
	atomic_t		count;
	struct work_struct	detach;

	/*
	 * Device might not be running if it's dirty and the cache set hasn't
	 * shown up yet.
	 */
	atomic_t		running;

	/*
	 * Writes take a shared lock from start to finish; scanning for dirty
	 * data to refill the rb tree requires an exclusive lock.
	 */
	struct rw_semaphore	writeback_lock;

	/*
	 * Nonzero, and writeback has a refcount (d->count), iff there is dirty
	 * data in the cache. Protected by writeback_lock; must have a
	 * shared lock to set and exclusive lock to clear.
	 */
	atomic_t		has_dirty;

	struct ratelimit	writeback_rate;
	struct delayed_work	writeback_rate_update;

	/*
	 * Internal to the writeback code, so read_dirty() can keep track of
	 * where it's at.
	 */
	sector_t		last_read;

	/* Number of writeback bios in flight */
	atomic_t		in_flight;
	struct closure_with_timer writeback;
	struct closure_waitlist	writeback_wait;

	struct keybuf		writeback_keys;

	/* For tracking sequential IO */
#define RECENT_IO_BITS	7
#define RECENT_IO	(1 << RECENT_IO_BITS)
	struct io		io[RECENT_IO];
	struct hlist_head	io_hash[RECENT_IO + 1];
	struct list_head	io_lru;
	spinlock_t		io_lock;

	struct cache_accounting	accounting;

	/* The rest of this all shows up in sysfs */
	unsigned		sequential_cutoff;
	unsigned		readahead;

	unsigned		sequential_merge:1;
	unsigned		verify:1;

	unsigned		writeback_metadata:1;
	unsigned		writeback_running:1;
	unsigned char		writeback_percent;
	unsigned		writeback_delay;

	int			writeback_rate_change;
	int64_t			writeback_rate_derivative;
	uint64_t		writeback_rate_target;

	unsigned		writeback_rate_update_seconds;
	unsigned		writeback_rate_d_term;
	unsigned		writeback_rate_p_term_inverse;
	unsigned		writeback_rate_d_smooth;
};
548
/*
 * Indices into cache->watermark[]; bch_cache_allocator_init() fills the
 * array so that each level's reserve includes the previous one's.
 */
enum alloc_watermarks {
	WATERMARK_PRIO,
	WATERMARK_METADATA,
	WATERMARK_MOVINGGC,
	WATERMARK_NONE,
	WATERMARK_MAX		/* array size, not a real watermark */
};
556
/* One cache device, member of a cache set (@set) */
struct cache {
	struct cache_set	*set;
	struct cache_sb		sb;
	struct bio		sb_bio;
	struct bio_vec		sb_bv[1];

	struct kobject		kobj;
	struct block_device	*bdev;

	/* Indexed by enum alloc_watermarks */
	unsigned		watermark[WATERMARK_MAX];

	struct closure		alloc;
	struct workqueue_struct	*alloc_workqueue;

	struct closure		prio;
	struct prio_set		*disk_buckets;

	/*
	 * When allocating new buckets, prio_write() gets first dibs - since we
	 * may not be able to allocate at all without writing priorities and
	 * gens. prio_buckets[] contains the last buckets we wrote priorities
	 * to (so gc can mark them as metadata), prio_next[] contains the
	 * buckets allocated for the next prio write.
	 *
	 * NOTE(review): there is no prio_next[] member; the two arrays below
	 * are prio_buckets and prio_last_buckets. Confirm against prio_write()
	 * which one holds the last-written buckets and update this comment.
	 */
	uint64_t		*prio_buckets;
	uint64_t		*prio_last_buckets;

	/*
	 * free: Buckets that are ready to be used
	 *
	 * free_inc: Incoming buckets - these are buckets that currently have
	 * cached data in them, and we can't reuse them until after we write
	 * their new gen to disk. After prio_write() finishes writing the new
	 * gens/prios, they'll be moved to the free list (and possibly discarded
	 * in the process)
	 *
	 * unused: GC found nothing pointing into these buckets (possibly
	 * because all the data they contained was overwritten), so we only
	 * need to discard them before they can be moved to the free list.
	 */
	DECLARE_FIFO(long, free);
	DECLARE_FIFO(long, free_inc);
	DECLARE_FIFO(long, unused);

	size_t			fifo_last_bucket;

	/* Allocation stuff: */
	struct bucket		*buckets;

	DECLARE_HEAP(struct bucket *, heap);

	/*
	 * max(gen - disk_gen) for all buckets. When it gets too big we have to
	 * call prio_write() to keep gens from wrapping.
	 */
	uint8_t			need_save_prio;
	unsigned		gc_move_threshold;

	/*
	 * If nonzero, we know we aren't going to find any buckets to invalidate
	 * until a gc finishes - otherwise we could pointlessly burn a ton of
	 * cpu
	 */
	unsigned		invalidate_needs_gc:1;

	bool			discard; /* Get rid of? */

	/*
	 * We preallocate structs for issuing discards to buckets, and keep them
	 * on this list when they're not in use; do_discard() issues discards
	 * whenever there's work to do and is called by free_some_buckets() and
	 * when a discard finishes.
	 */
	atomic_t		discards_in_flight;
	struct list_head	discards;

	struct journal_device	journal;

	/* The rest of this all shows up in sysfs */
#define IO_ERROR_SHIFT		20
	atomic_t		io_errors;
	atomic_t		io_count;

	atomic_long_t		meta_sectors_written;
	atomic_long_t		btree_sectors_written;
	atomic_long_t		sectors_written;

	struct bio_split_pool	bio_split_hook;
};
646
/* Garbage collection statistics; struct cache_set keeps one as gc_stats */
struct gc_stat {
	size_t			nodes;
	size_t			key_bytes;

	size_t			nkeys;
	uint64_t		data;	/* sectors */
	uint64_t		dirty;	/* sectors */
	unsigned		in_use;	/* percent */
};
656
/*
 * Flag bits, for how the cache set is shutting down, and what phase it's at:
 *
 * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching
 * all the backing devices first (their cached data gets invalidated, and they
 * won't automatically reattach).
 *
 * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
 * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
 * flushing dirty data).
 *
 * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down
 * the allocation thread.
 *
 * These are bit numbers, not masks.
 * NOTE(review): presumably used on cache_set.flags with the kernel bitops -
 * confirm against the users.
 */
#define CACHE_SET_UNREGISTERING		0
#define	CACHE_SET_STOPPING		1
#define	CACHE_SET_STOPPING_2		2
674
/*
 * A cache set: the main abstraction most of the code works with (see the
 * comment at the top of this file). Holds the member cache devices, the
 * attached bcache devices, the btree root and node cache, and the
 * allocation, gc and journal state.
 */
struct cache_set {
	struct closure		cl;

	struct list_head	list;
	struct kobject		kobj;
	struct kobject		internal;
	struct dentry		*debug;
	struct cache_accounting accounting;

	/* CACHE_SET_* bit numbers */
	unsigned long		flags;

	struct cache_sb		sb;

	/* cache[] is indexed by a pointer's PTR_DEV(), see PTR_CACHE() */
	struct cache		*cache[MAX_CACHES_PER_SET];
	struct cache		*cache_by_alloc[MAX_CACHES_PER_SET];
	int			caches_loaded;

	struct bcache_device	**devices;
	struct list_head	cached_devs;
	uint64_t		cached_dev_sectors;
	struct closure		caching;

	struct closure_with_waitlist sb_write;

	mempool_t		*search;
	mempool_t		*bio_meta;
	struct bio_set		*bio_split;

	/* For the btree cache */
	struct shrinker		shrink;

	/* For the allocator itself */
	wait_queue_head_t	alloc_wait;

	/* For the btree cache and anything allocation related */
	struct mutex		bucket_lock;

	/* log2(bucket_size), in sectors */
	unsigned short		bucket_bits;

	/* log2(block_size), in sectors */
	unsigned short		block_bits;

	/*
	 * Default number of pages for a new btree node - may be less than a
	 * full bucket
	 */
	unsigned		btree_pages;

	/*
	 * Lists of struct btrees; lru is the list for structs that have memory
	 * allocated for actual btree node, freed is for structs that do not.
	 *
	 * We never free a struct btree, except on shutdown - we just put it on
	 * the btree_cache_freed list and reuse it later. This simplifies the
	 * code, and it doesn't cost us much memory as the memory usage is
	 * dominated by buffers that hold the actual btree node data and those
	 * can be freed - and the number of struct btrees allocated is
	 * effectively bounded.
	 *
	 * btree_cache_freeable effectively is a small cache - we use it because
	 * high order page allocations can be rather expensive, and it's quite
	 * common to delete and allocate btree nodes in quick succession. It
	 * should never grow past ~2-3 nodes in practice.
	 */
	struct list_head	btree_cache;
	struct list_head	btree_cache_freeable;
	struct list_head	btree_cache_freed;

	/* Number of elements in btree_cache + btree_cache_freeable lists */
	unsigned		bucket_cache_used;

	/*
	 * If we need to allocate memory for a new btree node and that
	 * allocation fails, we can cannibalize another node in the btree cache
	 * to satisfy the allocation. However, only one thread can be doing this
	 * at a time, for obvious reasons - try_harder and try_wait are
	 * basically a lock for this that we can wait on asynchronously. The
	 * btree_root() macro releases the lock when it returns.
	 */
	struct closure		*try_harder;
	struct closure_waitlist	try_wait;
	uint64_t		try_harder_start;

	/*
	 * When we free a btree node, we increment the gen of the bucket the
	 * node is in - but we can't rewrite the prios and gens until we
	 * finished whatever it is we were doing, otherwise after a crash the
	 * btree node would be freed but for say a split, we might not have the
	 * pointers to the new nodes inserted into the btree yet.
	 *
	 * This is a refcount that blocks prio_write() until the new keys are
	 * written.
	 */
	atomic_t		prio_blocked;
	struct closure_waitlist	bucket_wait;

	/*
	 * For any bio we don't skip we subtract the number of sectors from
	 * rescale; when it hits 0 we rescale all the bucket priorities.
	 */
	atomic_t		rescale;
	/*
	 * When we invalidate buckets, we use both the priority and the amount
	 * of good data to determine which buckets to reuse first - to weight
	 * those together consistently we keep track of the smallest nonzero
	 * priority of any bucket.
	 */
	uint16_t		min_prio;

	/*
	 * max(gen - gc_gen) for all buckets. When it gets too big we have to gc
	 * to keep gens from wrapping around.
	 */
	uint8_t			need_gc;
	struct gc_stat		gc_stats;
	size_t			nbuckets;

	struct closure_with_waitlist gc;
	/* Where in the btree gc currently is */
	struct bkey		gc_done;

	/*
	 * The allocation code needs gc_mark in struct bucket to be correct, but
	 * it's not while a gc is in progress. Protected by bucket_lock.
	 */
	int			gc_mark_valid;

	/* Counts how many sectors bio_insert has added to the cache */
	atomic_t		sectors_to_gc;

	struct closure		moving_gc;
	struct closure_waitlist	moving_gc_wait;
	struct keybuf		moving_gc_keys;
	/* Number of moving GC bios in flight */
	atomic_t		in_flight;

	struct btree		*root;

#ifdef CONFIG_BCACHE_DEBUG
	struct btree		*verify_data;
	struct mutex		verify_lock;
#endif

	unsigned		nr_uuids;
	struct uuid_entry	*uuids;
	BKEY_PADDED(uuid_bucket);
	struct closure_with_waitlist uuid_write;

	/*
	 * A btree node on disk could have too many bsets for an iterator to fit
	 * on the stack - this is a single element mempool for btree_read_work()
	 */
	struct mutex		fill_lock;
	struct btree_iter	*fill_iter;

	/*
	 * btree_sort() is a merge sort and requires temporary space - single
	 * element mempool
	 */
	struct mutex		sort_lock;
	struct bset		*sort;

	/* List of buckets we're currently writing data to */
	struct list_head	data_buckets;
	spinlock_t		data_bucket_lock;

	struct journal		journal;

#define CONGESTED_MAX		1024
	unsigned		congested_last_us;
	atomic_t		congested;

	/* The rest of this all shows up in sysfs */
	unsigned		congested_read_threshold_us;
	unsigned		congested_write_threshold_us;

	spinlock_t		sort_time_lock;
	struct time_stats	sort_time;
	struct time_stats	btree_gc_time;
	struct time_stats	btree_split_time;
	spinlock_t		btree_read_time_lock;
	struct time_stats	btree_read_time;
	struct time_stats	try_harder_time;

	atomic_long_t		cache_read_races;
	atomic_long_t		writeback_keys_done;
	atomic_long_t		writeback_keys_failed;
	unsigned		error_limit;
	unsigned		error_decay;
	unsigned short		journal_delay_ms;
	unsigned		verify:1;
	unsigned		key_merging_disabled:1;
	unsigned		gc_always_rewrite:1;
	unsigned		shrinker_disabled:1;
	unsigned		copy_gc_enabled:1;

#define BUCKET_HASH_BITS	12
	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
};
875
/*
 * Whether key merging has been switched off for cache set @c. The flag is
 * only consulted when CONFIG_BCACHE_DEBUG is set; otherwise the answer is
 * always false.
 */
static inline bool key_merging_disabled(struct cache_set *c)
{
	bool disabled = false;

#ifdef CONFIG_BCACHE_DEBUG
	disabled = c->key_merging_disabled;
#endif
	return disabled;
}
884
885static inline bool SB_IS_BDEV(const struct cache_sb *sb)
886{
887 return sb->version == BCACHE_SB_VERSION_BDEV
888 || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
889}
890
/*
 * A struct bio bundled with a submit timestamp and a key that has room for
 * exactly one pointer.
 */
struct bbio {
	unsigned		submit_time_us;
	union {
		struct bkey	key;
		uint64_t	_pad[3];
		/*
		 * We only need pad = 3 here because we only ever carry around a
		 * single pointer - i.e. the pointer we're doing io to/from.
		 * (Two header words of struct bkey plus one ptr[] entry.)
		 */
	};
	struct bio		bio;
};
903
/*
 * local_clock() shifted right by 10 bits (i.e. divided by 1024), truncated to
 * unsigned.
 * NOTE(review): presumably local_clock() counts nanoseconds, making this a
 * cheap approximation of microseconds - confirm.
 */
static inline unsigned local_clock_us(void)
{
	uint64_t now = local_clock();

	return now >> 10;
}
908
#define MAX_BSETS		4U

/* Bucket priorities are 16 bit; BTREE_PRIO is the largest possible value */
#define BTREE_PRIO		USHRT_MAX
#define INITIAL_PRIO		32768
913
/* Size in bytes of a default sized (c->btree_pages pages) btree node */
#define btree_bytes(c)		((c)->btree_pages * PAGE_SIZE)

/*
 * Size of btree node @b in blocks: KEY_SIZE() of the node's key (sectors)
 * shifted down by the cache set's block_bits. @b is expanded twice, so it
 * must not have side effects.
 */
#define btree_blocks(b)							\
	((unsigned) (KEY_SIZE(&(b)->key) >> (b)->c->block_bits))

/* Size in blocks of a default sized btree node for cache set @c */
#define btree_default_blocks(c)						\
	((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
920
/*
 * sb.bucket_size and sb.block_size are in sectors (see struct cache_sb);
 * << 9 converts sectors to bytes.
 */
#define bucket_pages(c)		((c)->sb.bucket_size / PAGE_SECTORS)
#define bucket_bytes(c)		((c)->sb.bucket_size << 9)
#define block_bytes(c)		((c)->sb.block_size << 9)
924
/*
 * Size in bytes of set @i if it held @k uint64_t words of key data: the
 * header (sizeof(*i)) plus the words themselves.
 */
#define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t))
/* Current size in bytes of set @i; @i is expanded twice */
#define set_bytes(i)		__set_bytes(i, (i)->keys)

/* Same as the above, rounded up to whole blocks of cache set @c */
#define __set_blocks(i, k, c)	DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
#define set_blocks(i, c)	__set_blocks(i, (i)->keys, c)
930
/* The key starting @j uint64_t words into set @i's data */
#define node(i, j)		((struct bkey *) ((i)->d + (j)))
/* One past the last key of set @i (its keys field counts uint64_t words) */
#define end(i)			node(i, (i)->keys)
933
/*
 * Offset of @i from the start of btree node @b's first set, in blocks: the
 * byte distance between @i and (b)->sets[0].data divided by the block size.
 * Relies on GNU void pointer arithmetic. Both arguments are parenthesised so
 * that expressions such as index(p + 1, b) measure from the intended address.
 */
#define index(i, b)							\
	((size_t) (((void *) (i) - (void *) (b)->sets[0].data) /	\
		   block_bytes((b)->c)))
937
/* Bytes in the (b)->page_order sized allocation of btree node @b */
#define btree_data_space(b)	(PAGE_SIZE << (b)->page_order)

/*
 * Number of struct bucket_disk entries that fit in one bucket after the
 * struct prio_set header.
 */
#define prios_per_bucket(c)				\
	((bucket_bytes(c) - sizeof(struct prio_set)) /	\
	 sizeof(struct bucket_disk))
/* Number of buckets needed to hold one entry for each of sb.nbuckets */
#define prio_buckets(c)					\
	DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
945
/*
 * Per-structure magic constants. The *_magic(c) variants XOR them with the
 * cache set's sb.set_magic, yielding values specific to that cache set.
 */
#define JSET_MAGIC		0x245235c1a3625032ULL
#define PSET_MAGIC		0x6750e15f87337f91ULL
#define BSET_MAGIC		0x90135c78b99e07f5ULL

#define jset_magic(c)		((c)->sb.set_magic ^ JSET_MAGIC)
#define pset_magic(c)		((c)->sb.set_magic ^ PSET_MAGIC)
#define bset_magic(c)		((c)->sb.set_magic ^ BSET_MAGIC)
953
/* Bkey fields: all units are in sectors */

/* Accessor pair for a bitfield of struct bkey's @field word */
#define KEY_FIELD(name, field, offset, size)				\
	BITMASK(name, struct bkey, field, offset, size)

/*
 * Generates name(k, i) and SET_name(k, i, v) for a @size bit wide field at
 * bit @offset of pointer @i of key @k.
 *
 * The setter masks @v down to @size bits before merging it in, so an
 * out-of-range value is truncated rather than spilling into the neighbouring
 * fields of the same pointer word.
 */
#define PTR_FIELD(name, offset, size)					\
	static inline uint64_t name(const struct bkey *k, unsigned i)	\
	{								\
		return (k->ptr[i] >> (offset)) &			\
			~(((uint64_t) ~0) << (size));			\
	}								\
									\
	static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
	{								\
		k->ptr[i] &= ~(~((uint64_t) ~0 << (size)) << (offset));	\
		k->ptr[i] |= (v & ~(((uint64_t) ~0) << (size))) << (offset);\
	}
968
/*
 * Layout of bkey.high, most significant bits first:
 *   bits 60-62	KEY_PTRS
 *   bits 58-59	HEADER_SIZE
 *   bits 56-57	KEY_CSUM
 *   bit  55	KEY_PINNED
 *   bit  36	KEY_DIRTY
 *   bits 20-35	KEY_SIZE
 *   bits  0-19	KEY_INODE
 * Bit 63 is set by the KEY() initializer.
 */
KEY_FIELD(KEY_PTRS,	high, 60, 3)
KEY_FIELD(HEADER_SIZE,	high, 58, 2)
KEY_FIELD(KEY_CSUM,	high, 56, 2)
KEY_FIELD(KEY_PINNED,	high, 55, 1)
KEY_FIELD(KEY_DIRTY,	high, 36, 1)

KEY_FIELD(KEY_SIZE,	high, 20, 16)
KEY_FIELD(KEY_INODE,	high, 0,  20)
977
978/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
979
980static inline uint64_t KEY_OFFSET(const struct bkey *k)
981{
982 return k->low;
983}
984
985static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v)
986{
987 k->low = v;
988}
989
/*
 * Layout of each bkey.ptr[] word:
 *   bits 51-62	PTR_DEV
 *   bits  8-50	PTR_OFFSET
 *   bits  0-7	PTR_GEN
 */
PTR_FIELD(PTR_DEV,		51, 12)
PTR_FIELD(PTR_OFFSET,		8,  43)
PTR_FIELD(PTR_GEN,		0,  8)
993
/*
 * The all-ones value of the 12 bit PTR_DEV field.
 * NOTE(review): presumably a reserved device id for pointers that don't refer
 * to a real cache device - confirm against the users.
 */
#define PTR_CHECK_DEV			((1 << 12) - 1)

/*
 * Build a pointer word from a generation, a sector offset and a device id,
 * matching the PTR_GEN/PTR_OFFSET/PTR_DEV layout. The values are not masked;
 * callers must pass in-range values. Every argument is parenthesised so that
 * arbitrary expressions (e.g. a conditional) are combined as a unit.
 */
#define PTR(gen, offset, dev)						\
	((((uint64_t) (dev)) << 51) | (((uint64_t) (offset)) << 8) | (gen))
998
999static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
1000{
1001 return s >> c->bucket_bits;
1002}
1003
1004static inline sector_t bucket_to_sector(struct cache_set *c, size_t b)
1005{
1006 return ((sector_t) b) << c->bucket_bits;
1007}
1008
1009static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
1010{
1011 return s & (c->sb.bucket_size - 1);
1012}
1013
1014static inline struct cache *PTR_CACHE(struct cache_set *c,
1015 const struct bkey *k,
1016 unsigned ptr)
1017{
1018 return c->cache[PTR_DEV(k, ptr)];
1019}
1020
1021static inline size_t PTR_BUCKET_NR(struct cache_set *c,
1022 const struct bkey *k,
1023 unsigned ptr)
1024{
1025 return sector_to_bucket(c, PTR_OFFSET(k, ptr));
1026}
1027
1028static inline struct bucket *PTR_BUCKET(struct cache_set *c,
1029 const struct bkey *k,
1030 unsigned ptr)
1031{
1032 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
1033}
1034
1035/* Btree key macros */
1036
1037/*
1038 * The high bit being set is a relic from when we used it to do binary
1039 * searches - it told you where a key started. It's not used anymore,
1040 * and can probably be safely dropped.
1041 */
1042#define KEY(dev, sector, len) \
1043((struct bkey) { \
1044 .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \
1045 .low = (sector) \
1046})
1047
1048static inline void bkey_init(struct bkey *k)
1049{
1050 *k = KEY(0, 0, 0);
1051}
1052
1053#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k))
1054#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0)
1055#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0)
1056#define ZERO_KEY KEY(0, 0, 0)
1057
/*
 * Checksum helper shared by the on disk structures cache_sb, prio_set, bset
 * and jset.  The checksum is _always_ the first 8 bytes of these structs, so
 * the CRC covers everything from just past that first uint64_t up to end(i).
 */
#define csum_set(i)							\
	bch_crc64(((void *) (i)) + sizeof(uint64_t),			\
		  ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t)))
1065
1066/* Error handling macros */
1067
/*
 * cache_bug(): report an error on cache set @c through
 * bch_cache_set_error(); dump the stack if that call returns true.
 */
#define cache_bug(c, ...)						\
do {									\
	if (bch_cache_set_error(c, __VA_ARGS__))			\
		dump_stack();						\
} while (0)

/* btree_bug(): same as cache_bug(), on the cache set that owns btree @b. */
#define btree_bug(b, ...)						\
do {									\
	if (bch_cache_set_error((b)->c, __VA_ARGS__))			\
		dump_stack();						\
} while (0)

/* Conditional forms: only act when @cond is true. */
#define btree_bug_on(cond, b, ...)					\
do {									\
	if (cond)							\
		btree_bug(b, __VA_ARGS__);				\
} while (0)

#define cache_bug_on(cond, c, ...)					\
do {									\
	if (cond)							\
		cache_bug(c, __VA_ARGS__);				\
} while (0)

/* Like cache_bug_on(), but never dumps the stack. */
#define cache_set_err_on(cond, c, ...)					\
do {									\
	if (cond)							\
		bch_cache_set_error(c, __VA_ARGS__);			\
} while (0)
1097
1098/* Looping macros */
1099
/*
 * for_each_cache(): run the loop body with @iter going from 0 up to (but not
 * including) sb.nr_in_set, and @ca set to cs->cache[iter].
 *
 * The assignment to @ca is evaluated before the bound test, so the slot at
 * index nr_in_set is read as well, although the body is not run for it.
 * @cs is parenthesized at every use so an expression argument expands
 * correctly.
 */
#define for_each_cache(ca, cs, iter)					\
	for (iter = 0;							\
	     ca = (cs)->cache[iter], iter < (cs)->sb.nr_in_set;		\
	     iter++)

/*
 * for_each_bucket(): walk @b over @ca's buckets array, from index
 * sb.first_bucket up to (but not including) sb.nbuckets.
 */
#define for_each_bucket(b, ca)						\
	for (b = (ca)->buckets + (ca)->sb.first_bucket;			\
	     b < (ca)->buckets + (ca)->sb.nbuckets; b++)
1106
1107static inline void __bkey_put(struct cache_set *c, struct bkey *k)
1108{
1109 unsigned i;
1110
1111 for (i = 0; i < KEY_PTRS(k); i++)
1112 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
1113}
1114
1115/* Blktrace macros */
1116
/*
 * blktrace_msg(): emit a blktrace message on the request queue of @c's block
 * device, if bdev_get_queue() returns one.  @c is parenthesized so that an
 * expression argument is dereferenced as a whole.
 */
#define blktrace_msg(c, fmt, ...)					\
do {									\
	struct request_queue *q = bdev_get_queue((c)->bdev);		\
	if (q)								\
		blk_add_trace_msg(q, fmt, ##__VA_ARGS__);		\
} while (0)

/* blktrace_msg_all(): blktrace_msg() on every cache of cache set @s. */
#define blktrace_msg_all(s, fmt, ...)					\
do {									\
	struct cache *_c;						\
	unsigned i;							\
	for_each_cache(_c, (s), i)					\
		blktrace_msg(_c, fmt, ##__VA_ARGS__);			\
} while (0)
1131
1132static inline void cached_dev_put(struct cached_dev *dc)
1133{
1134 if (atomic_dec_and_test(&dc->count))
1135 schedule_work(&dc->detach);
1136}
1137
1138static inline bool cached_dev_get(struct cached_dev *dc)
1139{
1140 if (!atomic_inc_not_zero(&dc->count))
1141 return false;
1142
1143 /* Paired with the mb in cached_dev_attach */
1144 smp_mb__after_atomic_inc();
1145 return true;
1146}
1147
1148/*
1149 * bucket_gc_gen() returns the difference between the bucket's current gen and
1150 * the oldest gen of any pointer into that bucket in the btree (last_gc).
1151 *
1152 * bucket_disk_gen() returns the difference between the current gen and the gen
1153 * on disk; they're both used to make sure gens don't wrap around.
1154 */
1155
1156static inline uint8_t bucket_gc_gen(struct bucket *b)
1157{
1158 return b->gen - b->last_gc;
1159}
1160
1161static inline uint8_t bucket_disk_gen(struct bucket *b)
1162{
1163 return b->gen - b->disk_gen;
1164}
1165
1166#define BUCKET_GC_GEN_MAX 96U
1167#define BUCKET_DISK_GEN_MAX 64U
1168
/* Define a static, owner-writable kobj_attribute ksysfs_<n> with store @fn. */
#define kobj_attribute_write(n, fn)					\
	static struct kobj_attribute ksysfs_##n =			\
		__ATTR(n, S_IWUSR, NULL, fn)

/* Define a static, owner-readable and -writable kobj_attribute ksysfs_<n>. */
#define kobj_attribute_rw(n, show, store)				\
	static struct kobj_attribute ksysfs_##n =			\
		__ATTR(n, S_IWUSR|S_IRUSR, show, store)
1175
1176/* Forward declarations */
1177
1178void bch_writeback_queue(struct cached_dev *);
1179void bch_writeback_add(struct cached_dev *, unsigned);
1180
1181void bch_count_io_errors(struct cache *, int, const char *);
1182void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
1183 int, const char *);
1184void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
1185void bch_bbio_free(struct bio *, struct cache_set *);
1186struct bio *bch_bbio_alloc(struct cache_set *);
1187
1188struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *);
1189void bch_generic_make_request(struct bio *, struct bio_split_pool *);
1190void __bch_submit_bbio(struct bio *, struct cache_set *);
1191void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
1192
1193uint8_t bch_inc_gen(struct cache *, struct bucket *);
1194void bch_rescale_priorities(struct cache_set *, int);
1195bool bch_bucket_add_unused(struct cache *, struct bucket *);
1196void bch_allocator_thread(struct closure *);
1197
1198long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
1199void bch_bucket_free(struct cache_set *, struct bkey *);
1200
1201int __bch_bucket_alloc_set(struct cache_set *, unsigned,
1202 struct bkey *, int, struct closure *);
1203int bch_bucket_alloc_set(struct cache_set *, unsigned,
1204 struct bkey *, int, struct closure *);
1205
1206__printf(2, 3)
1207bool bch_cache_set_error(struct cache_set *, const char *, ...);
1208
1209void bch_prio_write(struct cache *);
1210void bch_write_bdev_super(struct cached_dev *, struct closure *);
1211
1212extern struct workqueue_struct *bcache_wq, *bch_gc_wq;
1213extern const char * const bch_cache_modes[];
1214extern struct mutex bch_register_lock;
1215extern struct list_head bch_cache_sets;
1216
1217extern struct kobj_type bch_cached_dev_ktype;
1218extern struct kobj_type bch_flash_dev_ktype;
1219extern struct kobj_type bch_cache_set_ktype;
1220extern struct kobj_type bch_cache_set_internal_ktype;
1221extern struct kobj_type bch_cache_ktype;
1222
1223void bch_cached_dev_release(struct kobject *);
1224void bch_flash_dev_release(struct kobject *);
1225void bch_cache_set_release(struct kobject *);
1226void bch_cache_release(struct kobject *);
1227
1228int bch_uuid_write(struct cache_set *);
1229void bcache_write_super(struct cache_set *);
1230
1231int bch_flash_dev_create(struct cache_set *c, uint64_t size);
1232
1233int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
1234void bch_cached_dev_detach(struct cached_dev *);
1235void bch_cached_dev_run(struct cached_dev *);
1236void bcache_device_stop(struct bcache_device *);
1237
1238void bch_cache_set_unregister(struct cache_set *);
1239void bch_cache_set_stop(struct cache_set *);
1240
1241struct cache_set *bch_cache_set_alloc(struct cache_sb *);
1242void bch_btree_cache_free(struct cache_set *);
1243int bch_btree_cache_alloc(struct cache_set *);
1244void bch_writeback_init_cached_dev(struct cached_dev *);
1245void bch_moving_init_cache_set(struct cache_set *);
1246
1247void bch_cache_allocator_exit(struct cache *ca);
1248int bch_cache_allocator_init(struct cache *ca);
1249
1250void bch_debug_exit(void);
1251int bch_debug_init(struct kobject *);
1252void bch_writeback_exit(void);
1253int bch_writeback_init(void);
1254void bch_request_exit(void);
1255int bch_request_init(void);
1256void bch_btree_exit(void);
1257int bch_btree_init(void);
1258
1259#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
new file mode 100644
index 000000000000..cb4578a327b9
--- /dev/null
+++ b/drivers/md/bcache/bset.c
@@ -0,0 +1,1192 @@
1/*
2 * Code for working with individual keys, and sorted sets of keys within a
3 * btree node
4 *
5 * Copyright 2012 Google, Inc.
6 */
7
8#include "bcache.h"
9#include "btree.h"
10#include "debug.h"
11
12#include <linux/random.h>
13#include <linux/prefetch.h>
14
15/* Keylists */
16
17void bch_keylist_copy(struct keylist *dest, struct keylist *src)
18{
19 *dest = *src;
20
21 if (src->list == src->d) {
22 size_t n = (uint64_t *) src->top - src->d;
23 dest->top = (struct bkey *) &dest->d[n];
24 dest->list = dest->d;
25 }
26}
27
28int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
29{
30 unsigned oldsize = (uint64_t *) l->top - l->list;
31 unsigned newsize = oldsize + 2 + nptrs;
32 uint64_t *new;
33
34 /* The journalling code doesn't handle the case where the keys to insert
35 * is bigger than an empty write: If we just return -ENOMEM here,
36 * bio_insert() and bio_invalidate() will insert the keys created so far
37 * and finish the rest when the keylist is empty.
38 */
39 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
40 return -ENOMEM;
41
42 newsize = roundup_pow_of_two(newsize);
43
44 if (newsize <= KEYLIST_INLINE ||
45 roundup_pow_of_two(oldsize) == newsize)
46 return 0;
47
48 new = krealloc(l->list == l->d ? NULL : l->list,
49 sizeof(uint64_t) * newsize, GFP_NOIO);
50
51 if (!new)
52 return -ENOMEM;
53
54 if (l->list == l->d)
55 memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE);
56
57 l->list = new;
58 l->top = (struct bkey *) (&l->list[oldsize]);
59
60 return 0;
61}
62
63struct bkey *bch_keylist_pop(struct keylist *l)
64{
65 struct bkey *k = l->bottom;
66
67 if (k == l->top)
68 return NULL;
69
70 while (bkey_next(k) != l->top)
71 k = bkey_next(k);
72
73 return l->top = k;
74}
75
76/* Pointer validation */
77
78bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
79{
80 unsigned i;
81
82 if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
83 goto bad;
84
85 if (!level && KEY_SIZE(k) > KEY_OFFSET(k))
86 goto bad;
87
88 if (!KEY_SIZE(k))
89 return true;
90
91 for (i = 0; i < KEY_PTRS(k); i++)
92 if (ptr_available(c, k, i)) {
93 struct cache *ca = PTR_CACHE(c, k, i);
94 size_t bucket = PTR_BUCKET_NR(c, k, i);
95 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
96
97 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
98 bucket < ca->sb.first_bucket ||
99 bucket >= ca->sb.nbuckets)
100 goto bad;
101 }
102
103 return false;
104bad:
105 cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
106 return true;
107}
108
109bool bch_ptr_bad(struct btree *b, const struct bkey *k)
110{
111 struct bucket *g;
112 unsigned i, stale;
113
114 if (!bkey_cmp(k, &ZERO_KEY) ||
115 !KEY_PTRS(k) ||
116 bch_ptr_invalid(b, k))
117 return true;
118
119 if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV)
120 return true;
121
122 for (i = 0; i < KEY_PTRS(k); i++)
123 if (ptr_available(b->c, k, i)) {
124 g = PTR_BUCKET(b->c, k, i);
125 stale = ptr_stale(b->c, k, i);
126
127 btree_bug_on(stale > 96, b,
128 "key too stale: %i, need_gc %u",
129 stale, b->c->need_gc);
130
131 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
132 b, "stale dirty pointer");
133
134 if (stale)
135 return true;
136
137#ifdef CONFIG_BCACHE_EDEBUG
138 if (!mutex_trylock(&b->c->bucket_lock))
139 continue;
140
141 if (b->level) {
142 if (KEY_DIRTY(k) ||
143 g->prio != BTREE_PRIO ||
144 (b->c->gc_mark_valid &&
145 GC_MARK(g) != GC_MARK_METADATA))
146 goto bug;
147
148 } else {
149 if (g->prio == BTREE_PRIO)
150 goto bug;
151
152 if (KEY_DIRTY(k) &&
153 b->c->gc_mark_valid &&
154 GC_MARK(g) != GC_MARK_DIRTY)
155 goto bug;
156 }
157 mutex_unlock(&b->c->bucket_lock);
158#endif
159 }
160
161 return false;
162#ifdef CONFIG_BCACHE_EDEBUG
163bug:
164 mutex_unlock(&b->c->bucket_lock);
165 btree_bug(b,
166"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
167 pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
168 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
169 return true;
170#endif
171}
172
173/* Key/pointer manipulation */
174
175void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
176 unsigned i)
177{
178 BUG_ON(i > KEY_PTRS(src));
179
180 /* Only copy the header, key, and one pointer. */
181 memcpy(dest, src, 2 * sizeof(uint64_t));
182 dest->ptr[0] = src->ptr[i];
183 SET_KEY_PTRS(dest, 1);
184 /* We didn't copy the checksum so clear that bit. */
185 SET_KEY_CSUM(dest, 0);
186}
187
188bool __bch_cut_front(const struct bkey *where, struct bkey *k)
189{
190 unsigned i, len = 0;
191
192 if (bkey_cmp(where, &START_KEY(k)) <= 0)
193 return false;
194
195 if (bkey_cmp(where, k) < 0)
196 len = KEY_OFFSET(k) - KEY_OFFSET(where);
197 else
198 bkey_copy_key(k, where);
199
200 for (i = 0; i < KEY_PTRS(k); i++)
201 SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);
202
203 BUG_ON(len > KEY_SIZE(k));
204 SET_KEY_SIZE(k, len);
205 return true;
206}
207
208bool __bch_cut_back(const struct bkey *where, struct bkey *k)
209{
210 unsigned len = 0;
211
212 if (bkey_cmp(where, k) >= 0)
213 return false;
214
215 BUG_ON(KEY_INODE(where) != KEY_INODE(k));
216
217 if (bkey_cmp(where, &START_KEY(k)) > 0)
218 len = KEY_OFFSET(where) - KEY_START(k);
219
220 bkey_copy_key(k, where);
221
222 BUG_ON(len > KEY_SIZE(k));
223 SET_KEY_SIZE(k, len);
224 return true;
225}
226
227static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
228{
229 return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
230 ~((uint64_t)1 << 63);
231}
232
233/* Tries to merge l and r: l should be lower than r
234 * Returns true if we were able to merge. If we did merge, l will be the merged
235 * key, r will be untouched.
236 */
237bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
238{
239 unsigned i;
240
241 if (key_merging_disabled(b->c))
242 return false;
243
244 if (KEY_PTRS(l) != KEY_PTRS(r) ||
245 KEY_DIRTY(l) != KEY_DIRTY(r) ||
246 bkey_cmp(l, &START_KEY(r)))
247 return false;
248
249 for (i = 0; i < KEY_PTRS(l); i++)
250 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
251 PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
252 return false;
253
254 /* Keys with no pointers aren't restricted to one bucket and could
255 * overflow KEY_SIZE
256 */
257 if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
258 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
259 SET_KEY_SIZE(l, USHRT_MAX);
260
261 bch_cut_front(l, r);
262 return false;
263 }
264
265 if (KEY_CSUM(l)) {
266 if (KEY_CSUM(r))
267 l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
268 else
269 SET_KEY_CSUM(l, 0);
270 }
271
272 SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
273 SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
274
275 return true;
276}
277
278/* Binary tree stuff for auxiliary search trees */
279
/*
 * In-order successor of node @j in a binary tree stored in an array with
 * indices below @size: the leftmost node of the right subtree if there is
 * one, otherwise the result of shifting @j right past its lowest clear bit.
 */
static unsigned inorder_next(unsigned j, unsigned size)
{
	if (j * 2 + 1 >= size)
		return j >> (ffz(j) + 1);

	for (j = j * 2 + 1; j * 2 < size; j *= 2)
		;

	return j;
}
292
/*
 * In-order predecessor of node @j in a binary tree stored in an array with
 * indices below @size: the rightmost node of the left subtree if there is
 * one, otherwise the result of shifting @j right past its lowest set bit.
 */
static unsigned inorder_prev(unsigned j, unsigned size)
{
	if (j * 2 >= size)
		return j >> ffs(j);

	for (j *= 2; j * 2 + 1 < size; j = j * 2 + 1)
		;

	return j;
}
305
306/* I have no idea why this code works... and I'm the one who wrote it
307 *
308 * However, I do know what it does:
309 * Given a binary tree constructed in an array (i.e. how you normally implement
310 * a heap), it converts a node in the tree - referenced by array index - to the
311 * index it would have if you did an inorder traversal.
312 *
313 * Also tested for every j, size up to size somewhere around 6 million.
314 *
315 * The binary tree starts at array index 1, not 0
316 * extra is a function of size:
317 * extra = (size - rounddown_pow_of_two(size - 1)) << 1;
318 */
319static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
320{
321 unsigned b = fls(j);
322 unsigned shift = fls(size - 1) - b;
323
324 j ^= 1U << (b - 1);
325 j <<= 1;
326 j |= 1;
327 j <<= shift;
328
329 if (j > extra)
330 j -= (j - extra) >> 1;
331
332 return j;
333}
334
335static unsigned to_inorder(unsigned j, struct bset_tree *t)
336{
337 return __to_inorder(j, t->size, t->extra);
338}
339
340static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
341{
342 unsigned shift;
343
344 if (j > extra)
345 j += j - extra;
346
347 shift = ffs(j);
348
349 j >>= shift;
350 j |= roundup_pow_of_two(size) >> shift;
351
352 return j;
353}
354
355static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
356{
357 return __inorder_to_tree(j, t->size, t->extra);
358}
359
#if 0
/*
 * Disabled self test: for every tree size, walk the tree in order and check
 * that __inorder_to_tree() and __to_inorder() agree with each other and that
 * inorder_prev() undoes inorder_next().
 */
void inorder_test(void)
{
	unsigned long done = 0;
	ktime_t start = ktime_get();
	unsigned size;

	for (size = 2; size < 65536000; size++) {
		unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
		unsigned i = 1, j = rounddown_pow_of_two(size - 1);

		if (!(size % 4096))
			printk(KERN_NOTICE "loop %u, %llu per us\n", size,
			       done / ktime_us_delta(ktime_get(), start));

		for (;;) {
			if (__inorder_to_tree(i, size, extra) != j)
				panic("size %10u j %10u i %10u", size, j, i);

			if (__to_inorder(j, size, extra) != i)
				panic("size %10u j %10u i %10u", size, j, i);

			if (j == rounddown_pow_of_two(size) - 1)
				break;

			BUG_ON(inorder_prev(inorder_next(j, size), size) != j);

			j = inorder_next(j, size);
			i++;
		}

		done += size - 1;
	}
}
#endif
396
397/*
398 * Cacheline/offset <-> bkey pointer arithmetic:
399 *
400 * t->tree is a binary search tree in an array; each node corresponds to a key
401 * in one cacheline in t->set (BSET_CACHELINE bytes).
402 *
403 * This means we don't have to store the full index of the key that a node in
404 * the binary tree points to; to_inorder() gives us the cacheline, and then
405 * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
406 *
407 * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
408 * make this work.
409 *
410 * To construct the bfloat for an arbitrary key we need to know what the key
411 * immediately preceding it is: we have to check if the two keys differ in the
412 * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
413 * of the previous key so we can walk backwards to it from t->tree[j]'s key.
414 */
415
416static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
417 unsigned offset)
418{
419 return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
420}
421
422static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
423{
424 return ((void *) k - (void *) t->data) / BSET_CACHELINE;
425}
426
427static unsigned bkey_to_cacheline_offset(struct bkey *k)
428{
429 return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
430}
431
432static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
433{
434 return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
435}
436
437static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
438{
439 return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]);
440}
441
442/*
443 * For the write set - the one we're currently inserting keys into - we don't
444 * maintain a full search tree, we just keep a simple lookup table in t->prev.
445 */
446static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
447{
448 return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
449}
450
/*
 * Shift the 128 bit value @high:@low right by @shift bits and return the low
 * 64 bits of the result.  The only caller in this file passes a shift in the
 * range 0..63.
 *
 * x86-64 uses the shrd instruction; elsewhere the same result is composed
 * from two shifts.  The high word is shifted in two steps so that a @shift
 * of 0 never produces a shift by 64.
 */
static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
{
#ifdef CONFIG_X86_64
	asm("shrd %[shift],%[high],%[low]"
	    : [low] "+Rm" (low)
	    : [high] "R" (high),
	      [shift] "ci" (shift)
	    : "cc");
	return low;
#else
	const uint64_t from_high = (high << 1) << (63U - shift);

	return (low >> shift) | from_high;
#endif
}
465
466static inline unsigned bfloat_mantissa(const struct bkey *k,
467 struct bkey_float *f)
468{
469 const uint64_t *p = &k->low - (f->exponent >> 6);
470 return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
471}
472
473static void make_bfloat(struct bset_tree *t, unsigned j)
474{
475 struct bkey_float *f = &t->tree[j];
476 struct bkey *m = tree_to_bkey(t, j);
477 struct bkey *p = tree_to_prev_bkey(t, j);
478
479 struct bkey *l = is_power_of_2(j)
480 ? t->data->start
481 : tree_to_prev_bkey(t, j >> ffs(j));
482
483 struct bkey *r = is_power_of_2(j + 1)
484 ? node(t->data, t->data->keys - bkey_u64s(&t->end))
485 : tree_to_bkey(t, j >> (ffz(j) + 1));
486
487 BUG_ON(m < l || m > r);
488 BUG_ON(bkey_next(p) != m);
489
490 if (KEY_INODE(l) != KEY_INODE(r))
491 f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
492 else
493 f->exponent = fls64(r->low ^ l->low);
494
495 f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);
496
497 /*
498 * Setting f->exponent = 127 flags this node as failed, and causes the
499 * lookup code to fall back to comparing against the original key.
500 */
501
502 if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
503 f->mantissa = bfloat_mantissa(m, f) - 1;
504 else
505 f->exponent = 127;
506}
507
508static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
509{
510 if (t != b->sets) {
511 unsigned j = roundup(t[-1].size,
512 64 / sizeof(struct bkey_float));
513
514 t->tree = t[-1].tree + j;
515 t->prev = t[-1].prev + j;
516 }
517
518 while (t < b->sets + MAX_BSETS)
519 t++->size = 0;
520}
521
522static void bset_build_unwritten_tree(struct btree *b)
523{
524 struct bset_tree *t = b->sets + b->nsets;
525
526 bset_alloc_tree(b, t);
527
528 if (t->tree != b->sets->tree + bset_tree_space(b)) {
529 t->prev[0] = bkey_to_cacheline_offset(t->data->start);
530 t->size = 1;
531 }
532}
533
534static void bset_build_written_tree(struct btree *b)
535{
536 struct bset_tree *t = b->sets + b->nsets;
537 struct bkey *k = t->data->start;
538 unsigned j, cacheline = 1;
539
540 bset_alloc_tree(b, t);
541
542 t->size = min_t(unsigned,
543 bkey_to_cacheline(t, end(t->data)),
544 b->sets->tree + bset_tree_space(b) - t->tree);
545
546 if (t->size < 2) {
547 t->size = 0;
548 return;
549 }
550
551 t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
552
553 /* First we figure out where the first key in each cacheline is */
554 for (j = inorder_next(0, t->size);
555 j;
556 j = inorder_next(j, t->size)) {
557 while (bkey_to_cacheline(t, k) != cacheline)
558 k = bkey_next(k);
559
560 t->prev[j] = bkey_u64s(k);
561 k = bkey_next(k);
562 cacheline++;
563 t->tree[j].m = bkey_to_cacheline_offset(k);
564 }
565
566 while (bkey_next(k) != end(t->data))
567 k = bkey_next(k);
568
569 t->end = *k;
570
571 /* Then we build the tree */
572 for (j = inorder_next(0, t->size);
573 j;
574 j = inorder_next(j, t->size))
575 make_bfloat(t, j);
576}
577
578void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k)
579{
580 struct bset_tree *t;
581 unsigned inorder, j = 1;
582
583 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
584 if (k < end(t->data))
585 goto found_set;
586
587 BUG();
588found_set:
589 if (!t->size || !bset_written(b, t))
590 return;
591
592 inorder = bkey_to_cacheline(t, k);
593
594 if (k == t->data->start)
595 goto fix_left;
596
597 if (bkey_next(k) == end(t->data)) {
598 t->end = *k;
599 goto fix_right;
600 }
601
602 j = inorder_to_tree(inorder, t);
603
604 if (j &&
605 j < t->size &&
606 k == tree_to_bkey(t, j))
607fix_left: do {
608 make_bfloat(t, j);
609 j = j * 2;
610 } while (j < t->size);
611
612 j = inorder_to_tree(inorder + 1, t);
613
614 if (j &&
615 j < t->size &&
616 k == tree_to_prev_bkey(t, j))
617fix_right: do {
618 make_bfloat(t, j);
619 j = j * 2 + 1;
620 } while (j < t->size);
621}
622
623void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
624{
625 struct bset_tree *t = &b->sets[b->nsets];
626 unsigned shift = bkey_u64s(k);
627 unsigned j = bkey_to_cacheline(t, k);
628
629 /* We're getting called from btree_split() or btree_gc, just bail out */
630 if (!t->size)
631 return;
632
633 /* k is the key we just inserted; we need to find the entry in the
634 * lookup table for the first key that is strictly greater than k:
635 * it's either k's cacheline or the next one
636 */
637 if (j < t->size &&
638 table_to_bkey(t, j) <= k)
639 j++;
640
641 /* Adjust all the lookup table entries, and find a new key for any that
642 * have gotten too big
643 */
644 for (; j < t->size; j++) {
645 t->prev[j] += shift;
646
647 if (t->prev[j] > 7) {
648 k = table_to_bkey(t, j - 1);
649
650 while (k < cacheline_to_bkey(t, j, 0))
651 k = bkey_next(k);
652
653 t->prev[j] = bkey_to_cacheline_offset(k);
654 }
655 }
656
657 if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
658 return;
659
660 /* Possibly add a new entry to the end of the lookup table */
661
662 for (k = table_to_bkey(t, t->size - 1);
663 k != end(t->data);
664 k = bkey_next(k))
665 if (t->size == bkey_to_cacheline(t, k)) {
666 t->prev[t->size] = bkey_to_cacheline_offset(k);
667 t->size++;
668 }
669}
670
671void bch_bset_init_next(struct btree *b)
672{
673 struct bset *i = write_block(b);
674
675 if (i != b->sets[0].data) {
676 b->sets[++b->nsets].data = i;
677 i->seq = b->sets[0].data->seq;
678 } else
679 get_random_bytes(&i->seq, sizeof(uint64_t));
680
681 i->magic = bset_magic(b->c);
682 i->version = 0;
683 i->keys = 0;
684
685 bset_build_unwritten_tree(b);
686}
687
688struct bset_search_iter {
689 struct bkey *l, *r;
690};
691
692static struct bset_search_iter bset_search_write_set(struct btree *b,
693 struct bset_tree *t,
694 const struct bkey *search)
695{
696 unsigned li = 0, ri = t->size;
697
698 BUG_ON(!b->nsets &&
699 t->size < bkey_to_cacheline(t, end(t->data)));
700
701 while (li + 1 != ri) {
702 unsigned m = (li + ri) >> 1;
703
704 if (bkey_cmp(table_to_bkey(t, m), search) > 0)
705 ri = m;
706 else
707 li = m;
708 }
709
710 return (struct bset_search_iter) {
711 table_to_bkey(t, li),
712 ri < t->size ? table_to_bkey(t, ri) : end(t->data)
713 };
714}
715
716static struct bset_search_iter bset_search_tree(struct btree *b,
717 struct bset_tree *t,
718 const struct bkey *search)
719{
720 struct bkey *l, *r;
721 struct bkey_float *f;
722 unsigned inorder, j, n = 1;
723
724 do {
725 unsigned p = n << 4;
726 p &= ((int) (p - t->size)) >> 31;
727
728 prefetch(&t->tree[p]);
729
730 j = n;
731 f = &t->tree[j];
732
733 /*
734 * n = (f->mantissa > bfloat_mantissa())
735 * ? j * 2
736 * : j * 2 + 1;
737 *
738 * We need to subtract 1 from f->mantissa for the sign bit trick
739 * to work - that's done in make_bfloat()
740 */
741 if (likely(f->exponent != 127))
742 n = j * 2 + (((unsigned)
743 (f->mantissa -
744 bfloat_mantissa(search, f))) >> 31);
745 else
746 n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
747 ? j * 2
748 : j * 2 + 1;
749 } while (n < t->size);
750
751 inorder = to_inorder(j, t);
752
753 /*
754 * n would have been the node we recursed to - the low bit tells us if
755 * we recursed left or recursed right.
756 */
757 if (n & 1) {
758 l = cacheline_to_bkey(t, inorder, f->m);
759
760 if (++inorder != t->size) {
761 f = &t->tree[inorder_next(j, t->size)];
762 r = cacheline_to_bkey(t, inorder, f->m);
763 } else
764 r = end(t->data);
765 } else {
766 r = cacheline_to_bkey(t, inorder, f->m);
767
768 if (--inorder) {
769 f = &t->tree[inorder_prev(j, t->size)];
770 l = cacheline_to_bkey(t, inorder, f->m);
771 } else
772 l = t->data->start;
773 }
774
775 return (struct bset_search_iter) {l, r};
776}
777
778struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
779 const struct bkey *search)
780{
781 struct bset_search_iter i;
782
783 /*
784 * First, we search for a cacheline, then lastly we do a linear search
785 * within that cacheline.
786 *
787 * To search for the cacheline, there's three different possibilities:
788 * * The set is too small to have a search tree, so we just do a linear
789 * search over the whole set.
790 * * The set is the one we're currently inserting into; keeping a full
791 * auxiliary search tree up to date would be too expensive, so we
792 * use a much simpler lookup table to do a binary search -
793 * bset_search_write_set().
794 * * Or we use the auxiliary search tree we constructed earlier -
795 * bset_search_tree()
796 */
797
798 if (unlikely(!t->size)) {
799 i.l = t->data->start;
800 i.r = end(t->data);
801 } else if (bset_written(b, t)) {
802 /*
803 * Each node in the auxiliary search tree covers a certain range
804 * of bits, and keys above and below the set it covers might
805 * differ outside those bits - so we have to special case the
806 * start and end - handle that here:
807 */
808
809 if (unlikely(bkey_cmp(search, &t->end) >= 0))
810 return end(t->data);
811
812 if (unlikely(bkey_cmp(search, t->data->start) < 0))
813 return t->data->start;
814
815 i = bset_search_tree(b, t, search);
816 } else
817 i = bset_search_write_set(b, t, search);
818
819#ifdef CONFIG_BCACHE_EDEBUG
820 BUG_ON(bset_written(b, t) &&
821 i.l != t->data->start &&
822 bkey_cmp(tree_to_prev_bkey(t,
823 inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
824 search) > 0);
825
826 BUG_ON(i.r != end(t->data) &&
827 bkey_cmp(i.r, search) <= 0);
828#endif
829
830 while (likely(i.l != i.r) &&
831 bkey_cmp(i.l, search) <= 0)
832 i.l = bkey_next(i.l);
833
834 return i.l;
835}
836
837/* Btree iterator */
838
839static inline bool btree_iter_cmp(struct btree_iter_set l,
840 struct btree_iter_set r)
841{
842 int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
843
844 return c ? c > 0 : l.k < r.k;
845}
846
847static inline bool btree_iter_end(struct btree_iter *iter)
848{
849 return !iter->used;
850}
851
852void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
853 struct bkey *end)
854{
855 if (k != end)
856 BUG_ON(!heap_add(iter,
857 ((struct btree_iter_set) { k, end }),
858 btree_iter_cmp));
859}
860
861struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
862 struct bkey *search, struct bset_tree *start)
863{
864 struct bkey *ret = NULL;
865 iter->size = ARRAY_SIZE(iter->data);
866 iter->used = 0;
867
868 for (; start <= &b->sets[b->nsets]; start++) {
869 ret = bch_bset_search(b, start, search);
870 bch_btree_iter_push(iter, ret, end(start->data));
871 }
872
873 return ret;
874}
875
876struct bkey *bch_btree_iter_next(struct btree_iter *iter)
877{
878 struct btree_iter_set unused;
879 struct bkey *ret = NULL;
880
881 if (!btree_iter_end(iter)) {
882 ret = iter->data->k;
883 iter->data->k = bkey_next(iter->data->k);
884
885 if (iter->data->k > iter->data->end) {
886 WARN_ONCE(1, "bset was corrupt!\n");
887 iter->data->k = iter->data->end;
888 }
889
890 if (iter->data->k == iter->data->end)
891 heap_pop(iter, unused, btree_iter_cmp);
892 else
893 heap_sift(iter, 0, btree_iter_cmp);
894 }
895
896 return ret;
897}
898
899struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
900 struct btree *b, ptr_filter_fn fn)
901{
902 struct bkey *ret;
903
904 do {
905 ret = bch_btree_iter_next(iter);
906 } while (ret && fn(b, ret));
907
908 return ret;
909}
910
911struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)
912{
913 struct btree_iter iter;
914
915 bch_btree_iter_init(b, &iter, search);
916 return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
917}
918
919/* Mergesort */
920
/*
 * Trim keys so that the key at the top of the iterator's heap and the keys of
 * one of the heap root's children no longer conflict, repeating until the top
 * key needs no further fixing.
 *
 * btree_mergesort() calls this before taking each key when fixup is set and
 * b->level is 0.
 */
static void btree_sort_fixup(struct btree_iter *iter)
{
	while (iter->used > 1) {
		/* top is the heap root; i starts out as its first child */
		struct btree_iter_set *top = iter->data, *i = top + 1;
		struct bkey *k;

		/* With two children present, btree_iter_cmp() picks between them */
		if (iter->used > 2 &&
		    btree_iter_cmp(i[0], i[1]))
			i++;

		/*
		 * Walk set i for as long as top->k compares greater than the
		 * START_KEY() of the key being looked at (presumably: while
		 * the two extents overlap).
		 */
		for (k = i->k;
		     k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
		     k = bkey_next(k))
			/*
			 * The pointer comparison selects which key is trimmed.
			 * NOTE(review): looks like a higher address means the
			 * key comes from a newer set - confirm.
			 */
			if (top->k > i->k)
				__bch_cut_front(top->k, k);
			else if (KEY_SIZE(k))
				bch_cut_back(&START_KEY(k), top->k);

		/* Stop if top sits at the lower address or no key was visited */
		if (top->k < i->k || k == i->k)
			break;

		/* Keys in set i were modified: re-sift the heap from i's slot */
		heap_sift(iter, i - top, btree_iter_cmp);
	}
}
945
946static void btree_mergesort(struct btree *b, struct bset *out,
947 struct btree_iter *iter,
948 bool fixup, bool remove_stale)
949{
950 struct bkey *k, *last = NULL;
951 bool (*bad)(struct btree *, const struct bkey *) = remove_stale
952 ? bch_ptr_bad
953 : bch_ptr_invalid;
954
955 while (!btree_iter_end(iter)) {
956 if (fixup && !b->level)
957 btree_sort_fixup(iter);
958
959 k = bch_btree_iter_next(iter);
960 if (bad(b, k))
961 continue;
962
963 if (!last) {
964 last = out->start;
965 bkey_copy(last, k);
966 } else if (b->level ||
967 !bch_bkey_try_merge(b, last, k)) {
968 last = bkey_next(last);
969 bkey_copy(last, k);
970 }
971 }
972
973 out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
974
975 pr_debug("sorted %i keys", out->keys);
976 bch_check_key_order(b, out);
977}
978
/*
 * Merge the sets loaded into @iter into a temporary buffer and install the
 * result as set number @start of @b.
 *
 * @order: page order used when allocating the temporary buffer
 * @fixup: passed through to btree_mergesort()
 *
 * If the page allocation fails, the cache set's preallocated b->c->sort buffer
 * is used instead, under b->c->sort_lock.
 */
static void __btree_sort(struct btree *b, struct btree_iter *iter,
			 unsigned start, unsigned order, bool fixup)
{
	uint64_t start_time;
	/* Ask btree_mergesort() to drop stale keys only if nothing was written */
	bool remove_stale = !b->written;
	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
						     order);
	if (!out) {
		/* Fall back to the shared buffer; the lock is dropped below */
		mutex_lock(&b->c->sort_lock);
		out = b->c->sort;
		order = ilog2(bucket_pages(b->c));
	}

	start_time = local_clock();

	btree_mergesort(b, out, iter, fixup, remove_stale);
	/* Set @start is now the last set of the node */
	b->nsets = start;

	if (!fixup && !start && b->written)
		bch_btree_verify(b, out);

	if (!start && order == b->page_order) {
		/*
		 * Our temporary buffer is the same size as the btree node's
		 * buffer, we can just swap buffers instead of doing a big
		 * memcpy()
		 */

		out->magic	= bset_magic(b->c);
		out->seq	= b->sets[0].data->seq;
		out->version	= b->sets[0].data->version;
		swap(out, b->sets[0].data);

		/*
		 * If the shared buffer just became set 0, the node's old
		 * buffer (now in out) takes over as the shared buffer.
		 */
		if (b->c->sort == b->sets[0].data)
			b->c->sort = out;
	} else {
		b->sets[start].data->keys = out->keys;
		memcpy(b->sets[start].data->start, out->start,
		       (void *) end(out) - (void *) out->start);
	}

	/* out being the shared buffer means sort_lock was taken above */
	if (out == b->c->sort)
		mutex_unlock(&b->c->sort_lock);
	else
		free_pages((unsigned long) out, order);

	if (b->written)
		bset_build_written_tree(b);

	/* Only sorts starting at set 0 are recorded in sort_time */
	if (!start) {
		spin_lock(&b->c->sort_time_lock);
		bch_time_stats_update(&b->c->sort_time, start_time);
		spin_unlock(&b->c->sort_time_lock);
	}
}
1034
1035void bch_btree_sort_partial(struct btree *b, unsigned start)
1036{
1037 size_t oldsize = 0, order = b->page_order, keys = 0;
1038 struct btree_iter iter;
1039 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
1040
1041 BUG_ON(b->sets[b->nsets].data == write_block(b) &&
1042 (b->sets[b->nsets].size || b->nsets));
1043
1044 if (b->written)
1045 oldsize = bch_count_data(b);
1046
1047 if (start) {
1048 unsigned i;
1049
1050 for (i = start; i <= b->nsets; i++)
1051 keys += b->sets[i].data->keys;
1052
1053 order = roundup_pow_of_two(__set_bytes(b->sets->data,
1054 keys)) / PAGE_SIZE;
1055 if (order)
1056 order = ilog2(order);
1057 }
1058
1059 __btree_sort(b, &iter, start, order, false);
1060
1061 EBUG_ON(b->written && bch_count_data(b) != oldsize);
1062}
1063
/*
 * Sort everything loaded into @iter into set 0 of @b with extent fixup
 * enabled. It is a BUG to call this on a node with b->written == 0.
 */
void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
{
	BUG_ON(!b->written);
	__btree_sort(b, iter, 0, b->page_order, true);
}
1069
1070void bch_btree_sort_into(struct btree *b, struct btree *new)
1071{
1072 uint64_t start_time = local_clock();
1073
1074 struct btree_iter iter;
1075 bch_btree_iter_init(b, &iter, NULL);
1076
1077 btree_mergesort(b, new->sets->data, &iter, false, true);
1078
1079 spin_lock(&b->c->sort_time_lock);
1080 bch_time_stats_update(&b->c->sort_time, start_time);
1081 spin_unlock(&b->c->sort_time_lock);
1082
1083 bkey_copy_key(&new->key, &b->key);
1084 new->sets->size = 0;
1085}
1086
1087void bch_btree_sort_lazy(struct btree *b)
1088{
1089 if (b->nsets) {
1090 unsigned i, j, keys = 0, total;
1091
1092 for (i = 0; i <= b->nsets; i++)
1093 keys += b->sets[i].data->keys;
1094
1095 total = keys;
1096
1097 for (j = 0; j < b->nsets; j++) {
1098 if (keys * 2 < total ||
1099 keys < 1000) {
1100 bch_btree_sort_partial(b, j);
1101 return;
1102 }
1103
1104 keys -= b->sets[j].data->keys;
1105 }
1106
1107 /* Must sort if b->nsets == 3 or we'll overflow */
1108 if (b->nsets >= (MAX_BSETS - 1) - b->level) {
1109 bch_btree_sort(b);
1110 return;
1111 }
1112 }
1113
1114 bset_build_written_tree(b);
1115}
1116
1117/* Sysfs stuff */
1118
/* Counters accumulated by bch_btree_bset_stats() over a btree walk. */
struct bset_stats {
	/* Number of btree nodes visited */
	size_t nodes;
	/* Number of sets for which bset_written() was true / false */
	size_t sets_written, sets_unwritten;
	/* Key bytes (keys * sizeof(uint64_t)) in written / unwritten sets */
	size_t bytes_written, bytes_unwritten;
	/*
	 * floats: lookup-tree entries in written sets;
	 * failed: those entries whose exponent is 127
	 */
	size_t floats, failed;
};
1125
1126static int bch_btree_bset_stats(struct btree *b, struct btree_op *op,
1127 struct bset_stats *stats)
1128{
1129 struct bkey *k;
1130 unsigned i;
1131
1132 stats->nodes++;
1133
1134 for (i = 0; i <= b->nsets; i++) {
1135 struct bset_tree *t = &b->sets[i];
1136 size_t bytes = t->data->keys * sizeof(uint64_t);
1137 size_t j;
1138
1139 if (bset_written(b, t)) {
1140 stats->sets_written++;
1141 stats->bytes_written += bytes;
1142
1143 stats->floats += t->size - 1;
1144
1145 for (j = 1; j < t->size; j++)
1146 if (t->tree[j].exponent == 127)
1147 stats->failed++;
1148 } else {
1149 stats->sets_unwritten++;
1150 stats->bytes_unwritten += bytes;
1151 }
1152 }
1153
1154 if (b->level) {
1155 struct btree_iter iter;
1156
1157 for_each_key_filter(b, k, &iter, bch_ptr_bad) {
1158 int ret = btree(bset_stats, k, b, op, stats);
1159 if (ret)
1160 return ret;
1161 }
1162 }
1163
1164 return 0;
1165}
1166
1167int bch_bset_print_stats(struct cache_set *c, char *buf)
1168{
1169 struct btree_op op;
1170 struct bset_stats t;
1171 int ret;
1172
1173 bch_btree_op_init_stack(&op);
1174 memset(&t, 0, sizeof(struct bset_stats));
1175
1176 ret = btree_root(bset_stats, c, &op, &t);
1177 if (ret)
1178 return ret;
1179
1180 return snprintf(buf, PAGE_SIZE,
1181 "btree nodes: %zu\n"
1182 "written sets: %zu\n"
1183 "unwritten sets: %zu\n"
1184 "written key bytes: %zu\n"
1185 "unwritten key bytes: %zu\n"
1186 "floats: %zu\n"
1187 "failed: %zu\n",
1188 t.nodes,
1189 t.sets_written, t.sets_unwritten,
1190 t.bytes_written, t.bytes_unwritten,
1191 t.floats, t.failed);
1192}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
new file mode 100644
index 000000000000..57a9cff41546
--- /dev/null
+++ b/drivers/md/bcache/bset.h
@@ -0,0 +1,379 @@
1#ifndef _BCACHE_BSET_H
2#define _BCACHE_BSET_H
3
4/*
5 * BKEYS:
6 *
7 * A bkey contains a key, a size field, a variable number of pointers, and some
8 * ancillary flag bits.
9 *
10 * We use two different functions for validating bkeys, bch_ptr_invalid and
11 * bch_ptr_bad().
12 *
13 * bch_ptr_invalid() primarily filters out keys and pointers that would be
14 * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and
15 * pointers that occur in normal practice but don't point to real data.
16 *
17 * The one exception to the rule that ptr_invalid() filters out invalid keys is
18 * that it also filters out keys of size 0 - these are keys that have been
19 * completely overwritten. It'd be safe to delete these in memory while leaving
20 * them on disk, just unnecessary work - so we filter them out when resorting
21 * instead.
22 *
23 * We can't filter out stale keys when we're resorting, because garbage
24 * collection needs to find them to ensure bucket gens don't wrap around -
25 * unless we're rewriting the btree node those stale keys still exist on disk.
26 *
27 * We also implement functions here for removing some number of sectors from the
28 * front or the back of a bkey - this is mainly used for fixing overlapping
29 * extents, by removing the overlapping sectors from the older key.
30 *
31 * BSETS:
32 *
33 * A bset is an array of bkeys laid out contiguously in memory in sorted order,
34 * along with a header. A btree node is made up of a number of these, written at
35 * different times.
36 *
37 * There could be many of them on disk, but we never allow there to be more than
38 * 4 in memory - we lazily resort as needed.
39 *
40 * We implement code here for creating and maintaining auxiliary search trees
41 * (described below) for searching an individual bset, and on top of that we
42 * implement a btree iterator.
43 *
44 * BTREE ITERATOR:
45 *
46 * Most of the code in bcache doesn't care about an individual bset - it needs
47 * to search entire btree nodes and iterate over them in sorted order.
48 *
49 * The btree iterator code serves both functions; it iterates through the keys
50 * in a btree node in sorted order, starting from either keys after a specific
51 * point (if you pass it a search key) or the start of the btree node.
52 *
53 * AUXILIARY SEARCH TREES:
54 *
55 * Since keys are variable length, we can't use a binary search on a bset - we
56 * wouldn't be able to find the start of the next key. But binary searches are
57 * slow anyways, due to terrible cache behaviour; bcache originally used binary
58 * searches and that code topped out at under 50k lookups/second.
59 *
60 * So we need to construct some sort of lookup table. Since we only insert keys
61 * into the last (unwritten) set, most of the keys within a given btree node are
62 * usually in sets that are mostly constant. We use two different types of
63 * lookup tables to take advantage of this.
64 *
65 * Both lookup tables share in common that they don't index every key in the
66 * set; they index one key every BSET_CACHELINE bytes, and then a linear search
67 * is used for the rest.
68 *
69 * For sets that have been written to disk and are no longer being inserted
70 * into, we construct a binary search tree in an array - traversing a binary
71 * search tree in an array gives excellent locality of reference and is very
72 * fast, since both children of any node are adjacent to each other in memory
73 * (and their grandchildren, and great grandchildren...) - this means
74 * prefetching can be used to great effect.
75 *
76 * It's quite useful performance wise to keep these nodes small - not just
77 * because they're more likely to be in L2, but also because we can prefetch
78 * more nodes on a single cacheline and thus prefetch more iterations in advance
79 * when traversing this tree.
80 *
81 * Nodes in the auxiliary search tree must contain both a key to compare against
82 * (we don't want to fetch the key from the set, that would defeat the purpose),
83 * and a pointer to the key. We use a few tricks to compress both of these.
84 *
85 * To compress the pointer, we take advantage of the fact that one node in the
86 * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
87 * a function (to_inorder()) that takes the index of a node in a binary tree and
88 * returns what its index would be in an inorder traversal, so we only have to
89 * store the low bits of the offset.
90 *
91 * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
92 * compress that, we take advantage of the fact that when we're traversing the
93 * search tree at every iteration we know that both our search key and the key
94 * we're looking for lie within some range - bounded by our previous
95 * comparisons. (We special case the start of a search so that this is true even
96 * at the root of the tree).
97 *
98 * So we know the key we're looking for is between a and b, and a and b don't
99 * differ higher than bit 50, we don't need to check anything higher than bit
100 * 50.
101 *
102 * We don't usually need the rest of the bits, either; we only need enough bits
103 * to partition the key range we're currently checking. Consider key n - the
104 * key our auxiliary search tree node corresponds to, and key p, the key
105 * immediately preceding n. The lowest bit we need to store in the auxiliary
106 * search tree is the highest bit that differs between n and p.
107 *
108 * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
109 * comparison. But we'd really like our nodes in the auxiliary search tree to be
110 * of fixed size.
111 *
112 * The solution is to make them fixed size, and when we're constructing a node
113 * check if p and n differed in the bits we needed them to. If they don't we
114 * flag that node, and when doing lookups we fallback to comparing against the
115 * real key. As long as this doesn't happen too often (and it seems to reliably
116 * happen a bit less than 1% of the time), we win - even on failures, that key
117 * is then more likely to be in cache than if we were doing binary searches all
118 * the way, since we're touching so much less memory.
119 *
120 * The keys in the auxiliary search tree are stored in (software) floating
121 * point, with an exponent and a mantissa. The exponent needs to be big enough
122 * to address all the bits in the original key, but the number of bits in the
123 * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
124 *
125 * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
126 * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
127 * We need one node per 128 bytes in the btree node, which means the auxiliary
128 * search trees take up 3% as much memory as the btree itself.
129 *
130 * Constructing these auxiliary search trees is moderately expensive, and we
131 * don't want to be constantly rebuilding the search tree for the last set
132 * whenever we insert another key into it. For the unwritten set, we use a much
133 * simpler lookup table - it's just a flat array, so index i in the lookup table
134 * corresponds to the ith range of BSET_CACHELINE bytes in the set. Indexing
135 * within each byte range works the same as with the auxiliary search trees.
136 *
137 * These are much easier to keep up to date when we insert a key - we do it
138 * somewhat lazily; when we shift a key up we usually just increment the pointer
139 * to it, only when it would overflow do we go to the trouble of finding the
140 * first key in that range of bytes again.
141 */
142
143/* Btree key comparison/iteration */
144
/*
 * Iterator over the keys of a btree node: a heap (see heap_add()/heap_pop()
 * users in bset.c) of per-set cursors.
 */
struct btree_iter {
	/* size: capacity of data[]; used: number of entries currently held */
	size_t size, used;
	struct btree_iter_set {
		/* k: next key to hand out from this set; end: where the set stops */
		struct bkey *k, *end;
	} data[MAX_BSETS];
};
151
/*
 * Per-set bookkeeping for a btree node: the lookup table state for one bset
 * plus a pointer to the bset itself.
 */
struct bset_tree {
	/*
	 * We construct a binary tree in an array as if the array
	 * started at 1, so that things line up on the same cachelines
	 * better: see comments in bset.c at cacheline_to_bkey() for
	 * details
	 */

	/* size of the binary tree and prev array */
	unsigned size;

	/* function of size - precalculated for to_inorder() */
	unsigned extra;

	/* copy of the last key in the set */
	struct bkey end;
	/* lookup table entries; indexed from 1 by the users in bset.c */
	struct bkey_float *tree;

	/*
	 * The nodes in the bset tree point to specific keys - this
	 * array holds the sizes of the previous key.
	 *
	 * Conceptually it's a member of struct bkey_float, but we want
	 * to keep bkey_float to 4 bytes and prev isn't used in the fast
	 * path.
	 */
	uint8_t *prev;

	/* The actual btree node, with pointers to each sorted set */
	struct bset *data;
};
183
/*
 * Three-way comparison of two keys: by inode first, then by offset.
 *
 * Returns a negative, zero or positive value when @l sorts before, equal to or
 * after @r. The value is the signed difference of the field that decided the
 * comparison.
 */
static __always_inline int64_t bkey_cmp(const struct bkey *l,
					const struct bkey *r)
{
	if (unlikely(KEY_INODE(l) != KEY_INODE(r)))
		return (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r);

	return (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
}
191
/*
 * Size of key @k in units of uint64_t: two header words, one word per pointer,
 * plus one more word when KEY_CSUM() is set. KEY_CSUM() values above 1 are a
 * BUG.
 */
static inline size_t bkey_u64s(const struct bkey *k)
{
	size_t words = 2 + KEY_PTRS(k);

	BUG_ON(KEY_CSUM(k) > 1);

	if (KEY_CSUM(k))
		words++;

	return words;
}
197
/* Size of key @k in bytes. */
static inline size_t bkey_bytes(const struct bkey *k)
{
	return sizeof(uint64_t) * bkey_u64s(k);
}
202
/*
 * Copy the whole of key @src (bkey_bytes(@src) bytes) to @dest. The regions
 * must not overlap, since memcpy() is used.
 */
static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
{
	size_t len = bkey_bytes(src);

	memcpy(dest, src, len);
}
207
/*
 * Copy only the position of @src (inode and offset) into @dest, leaving the
 * rest of @dest untouched.
 *
 * A NULL @src is treated as a key with inode 0 and offset 0.
 */
static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
{
	/*
	 * Do not point src at a compound literal created inside an if body:
	 * the literal's lifetime ends with that statement, so reading it
	 * afterwards would be a use of a dead object. Select the zero values
	 * directly instead.
	 */
	SET_KEY_INODE(dest, src ? KEY_INODE(src) : 0);
	SET_KEY_OFFSET(dest, src ? KEY_OFFSET(src) : 0);
}
216
/*
 * Return the address immediately following key @k, i.e. where the next key
 * starts when keys are stored back to back.
 */
static inline struct bkey *bkey_next(const struct bkey *k)
{
	uint64_t *words = (void *) k;

	words += bkey_u64s(k);

	return (struct bkey *) words;
}
222
223/* Keylists */
224
/*
 * A list of bkeys stored back to back in a buffer of uint64_ts.
 *
 * bch_keylist_init() points the list at the inline array d[];
 * bch_keylist_free() kfree()s the buffer only if it is no longer d[].
 */
struct keylist {
	/* Where the next key goes; equal to the buffer start when empty */
	struct bkey		*top;
	/* Start of the buffer, as raw u64s or as the first key */
	union {
		uint64_t		*list;
		struct bkey		*bottom;
	};

	/* Enough room for btree_split's keys without realloc */
#define KEYLIST_INLINE		16
	uint64_t		d[KEYLIST_INLINE];
};
236
237static inline void bch_keylist_init(struct keylist *l)
238{
239 l->top = (void *) (l->list = l->d);
240}
241
242static inline void bch_keylist_push(struct keylist *l)
243{
244 l->top = bkey_next(l->top);
245}
246
247static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
248{
249 bkey_copy(l->top, k);
250 bch_keylist_push(l);
251}
252
253static inline bool bch_keylist_empty(struct keylist *l)
254{
255 return l->top == (void *) l->list;
256}
257
258static inline void bch_keylist_free(struct keylist *l)
259{
260 if (l->list != l->d)
261 kfree(l->list);
262}
263
264void bch_keylist_copy(struct keylist *, struct keylist *);
265struct bkey *bch_keylist_pop(struct keylist *);
266int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
267
268void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
269 unsigned);
270bool __bch_cut_front(const struct bkey *, struct bkey *);
271bool __bch_cut_back(const struct bkey *, struct bkey *);
272
/*
 * Checked wrapper around __bch_cut_front(): it is a BUG for @where to compare
 * greater than @k. Returns whatever __bch_cut_front() returns.
 */
static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
{
	BUG_ON(bkey_cmp(where, k) > 0);
	return __bch_cut_front(where, k);
}
278
/*
 * Checked wrapper around __bch_cut_back(): it is a BUG for @where to compare
 * less than START_KEY(@k). Returns whatever __bch_cut_back() returns.
 */
static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
{
	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
	return __bch_cut_back(where, k);
}
284
285const char *bch_ptr_status(struct cache_set *, const struct bkey *);
286bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *);
287bool bch_ptr_bad(struct btree *, const struct bkey *);
288
/*
 * How far generation @a is ahead of generation @b, using 8-bit wraparound
 * arithmetic.
 *
 * Returns (a - b) mod 256 when that is at most 128, otherwise 0 (@a is then
 * considered not to be after @b).
 */
static inline uint8_t gen_after(uint8_t a, uint8_t b)
{
	uint8_t delta = a - b;

	if (delta > 128U)
		return 0;

	return delta;
}
294
295static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
296 unsigned i)
297{
298 return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
299}
300
301static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
302 unsigned i)
303{
304 return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
305}
306
307
308typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
309
310struct bkey *bch_next_recurse_key(struct btree *, struct bkey *);
311struct bkey *bch_btree_iter_next(struct btree_iter *);
312struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
313 struct btree *, ptr_filter_fn);
314
315void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
316struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
317 struct bkey *, struct bset_tree *);
318
319/* 32 bits total: */
320#define BKEY_MID_BITS 3
321#define BKEY_EXPONENT_BITS 7
322#define BKEY_MANTISSA_BITS 22
323#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
324
/*
 * One entry of a bset's auxiliary lookup tree, packed into 32 bits
 * (BKEY_EXPONENT_BITS + BKEY_MID_BITS + BKEY_MANTISSA_BITS = 7 + 3 + 22).
 */
struct bkey_float {
	/* An exponent of 127 is what the bset stats code counts as "failed" */
	unsigned	exponent:BKEY_EXPONENT_BITS;
	/* NOTE(review): presumably the key's 8-byte-aligned offset bits - confirm */
	unsigned	m:BKEY_MID_BITS;
	unsigned	mantissa:BKEY_MANTISSA_BITS;
} __packed;
330
331/*
332 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
333 * it used to be 64, but I realized the lookup code would touch slightly less
334 * memory if it was 128.
335 *
336 * It defines the number of bytes (in struct bset) per struct bkey_float in
337 * the auxiliary search tree - when we're done searching the bset_float tree we
338 * have this many bytes left that we do a linear search over.
339 *
340 * Since (after level 5) every level of the bset_tree is on a new cacheline,
341 * we're touching one fewer cacheline in the bset tree in exchange for one more
342 * cacheline in the linear search - but the linear search might stop before it
343 * gets to the second cacheline.
344 */
345
346#define BSET_CACHELINE 128
347#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE)
348
349#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float))
350#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t))
351
352void bch_bset_init_next(struct btree *);
353
354void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
355void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
356
357struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
358 const struct bkey *);
359
360static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
361 const struct bkey *search)
362{
363 return search ? __bch_bset_search(b, t, search) : t->data->start;
364}
365
366bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
367void bch_btree_sort_lazy(struct btree *);
368void bch_btree_sort_into(struct btree *, struct btree *);
369void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
370void bch_btree_sort_partial(struct btree *, unsigned);
371
/* Sort all sets of @b: a partial sort that starts at set 0. */
static inline void bch_btree_sort(struct btree *b)
{
	bch_btree_sort_partial(b, 0);
}
376
377int bch_bset_print_stats(struct cache_set *, char *);
378
379#endif
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
new file mode 100644
index 000000000000..7a5658f04e62
--- /dev/null
+++ b/drivers/md/bcache/btree.c
@@ -0,0 +1,2503 @@
1/*
2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 *
4 * Uses a block device as cache for other block devices; optimized for SSDs.
5 * All allocation is done in buckets, which should match the erase block size
6 * of the device.
7 *
8 * Buckets containing cached data are kept on a heap sorted by priority;
9 * bucket priority is increased on cache hit, and periodically all the buckets
10 * on the heap have their priority scaled down. This currently is just used as
11 * an LRU but in the future should allow for more intelligent heuristics.
12 *
13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14 * counter. Garbage collection is used to remove stale pointers.
15 *
16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
17 * as keys are inserted we only sort the pages that have not yet been written.
18 * When garbage collection is run, we resort the entire node.
19 *
20 * All configuration is done via sysfs; see Documentation/bcache.txt.
21 */
22
23#include "bcache.h"
24#include "btree.h"
25#include "debug.h"
26#include "request.h"
27
28#include <linux/slab.h>
29#include <linux/bitops.h>
30#include <linux/hash.h>
31#include <linux/prefetch.h>
32#include <linux/random.h>
33#include <linux/rcupdate.h>
34#include <trace/events/bcache.h>
35
36/*
37 * Todo:
38 * register_bcache: Return errors out to userspace correctly
39 *
40 * Writeback: don't undirty key until after a cache flush
41 *
42 * Create an iterator for key pointers
43 *
44 * On btree write error, mark bucket such that it won't be freed from the cache
45 *
46 * Journalling:
47 * Check for bad keys in replay
48 * Propagate barriers
49 * Refcount journal entries in journal_replay
50 *
51 * Garbage collection:
52 * Finish incremental gc
53 * Gc should free old UUIDs, data for invalid UUIDs
54 *
55 * Provide a way to list backing device UUIDs we have data cached for, and
56 * probably how long it's been since we've seen them, and a way to invalidate
57 * dirty data for devices that will never be attached again
58 *
59 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
60 * that based on that and how much dirty data we have we can keep writeback
61 * from being starved
62 *
63 * Add a tracepoint or somesuch to watch for writeback starvation
64 *
65 * When btree depth > 1 and splitting an interior node, we have to make sure
66 * alloc_bucket() cannot fail. This should be true but is not completely
67 * obvious.
68 *
69 * Make sure all allocations get charged to the root cgroup
70 *
71 * Plugging?
72 *
73 * If data write is less than hard sector size of ssd, round up offset in open
74 * bucket to the next whole sector
75 *
76 * Also lookup by cgroup in get_open_bucket()
77 *
78 * Superblock needs to be fleshed out for multiple cache devices
79 *
80 * Add a sysfs tunable for the number of writeback IOs in flight
81 *
82 * Add a sysfs tunable for the number of open data buckets
83 *
84 * IO tracking: Can we track when one process is doing io on behalf of another?
85 * IO tracking: Don't use just an average, weigh more recent stuff higher
86 *
87 * Test module load/unload
88 */
89
/* Printable names for btree operations, indexed by btree_op::type. */
static const char * const op_types[] = {
	"insert",
	"replace"
};
93
94static const char *op_type(struct btree_op *op)
95{
96 return op_types[op->type];
97}
98
99#define MAX_NEED_GC 64
100#define MAX_SAVE_PRIO 72
101
102#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
103
/*
 * Hash input for key @k: its first pointer shifted right by the cache set's
 * bucket_bits, OR-ed with the generation of that pointer. Both macro arguments
 * are parenthesized so that any expression can be passed for @c and @k.
 */
#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> (c)->bucket_bits) | PTR_GEN(k, 0))
106
107struct workqueue_struct *bch_gc_wq;
108static struct workqueue_struct *btree_io_wq;
109
110void bch_btree_op_init_stack(struct btree_op *op)
111{
112 memset(op, 0, sizeof(struct btree_op));
113 closure_init_stack(&op->cl);
114 op->lock = -1;
115 bch_keylist_init(&op->keys);
116}
117
118/* Btree key manipulation */
119
/*
 * Call __bkey_put() on @k, except for keys at a non-zero @level whose
 * KEY_OFFSET() is 0, which are left alone.
 */
static void bkey_put(struct cache_set *c, struct bkey *k, int level)
{
	if (level && !KEY_OFFSET(k))
		return;

	__bkey_put(c, k);
}
125
126/* Btree IO */
127
128static uint64_t btree_csum_set(struct btree *b, struct bset *i)
129{
130 uint64_t crc = b->key.ptr[0];
131 void *data = (void *) i + 8, *end = end(i);
132
133 crc = bch_crc64_update(crc, data, end - data);
134 return crc ^ 0xffffffffffffffffULL;
135}
136
137static void btree_bio_endio(struct bio *bio, int error)
138{
139 struct closure *cl = bio->bi_private;
140 struct btree *b = container_of(cl, struct btree, io.cl);
141
142 if (error)
143 set_btree_node_io_error(b);
144
145 bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
146 ? "writing btree" : "reading btree");
147 closure_put(cl);
148}
149
150static void btree_bio_init(struct btree *b)
151{
152 BUG_ON(b->bio);
153 b->bio = bch_bbio_alloc(b->c);
154
155 b->bio->bi_end_io = btree_bio_endio;
156 b->bio->bi_private = &b->io.cl;
157}
158
/*
 * Closure callback run once the read of a btree node has completed.
 *
 * Validates every bset found in the node's buffer, pushes each one onto the
 * cache set's fill_iter, then sorts them into a single set with extent fixup.
 * On any validation failure the node is flagged with an IO error and a cache
 * set error is reported. In both cases the node is marked read_done.
 */
void bch_btree_read_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct bset *i = b->sets[0].data;
	struct btree_iter *iter = b->c->fill_iter;
	/* Message for the failure currently being checked for */
	const char *err = "bad btree header";
	BUG_ON(b->nsets || b->written);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;

	/* fill_iter is shared through the cache set; fill_lock guards its use */
	mutex_lock(&b->c->fill_lock);
	iter->used = 0;

	if (btree_node_io_error(b) ||
	    !i->seq)
		goto err;

	/*
	 * Walk the bsets in the buffer for as long as they carry the same seq
	 * as the first one; b->written advances by each set's block count.
	 */
	for (;
	     b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, b->c) > btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(b->c))
			goto err;

		err = "bad checksum";
		/* The checksum function depends on the set's version */
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		/* Only the first set may be empty */
		err = "empty set";
		if (i != b->sets[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, end(i));

		b->written += set_blocks(i, b->c);
	}

	/* No block after the written part may carry this node's seq */
	err = "corrupted btree";
	for (i = write_block(b);
	     index(i, b) < btree_blocks(b);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->sets[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(b, iter);

	i = b->sets[0].data;
	err = "short btree key";
	if (b->sets[0].size &&
	    bkey_cmp(&b->key, &b->sets[0].end) < 0)
		goto err;

	/* Room left in the node: set up the next set for insertions */
	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
out:

	mutex_unlock(&b->c->fill_lock);

	spin_lock(&b->c->btree_read_time_lock);
	bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
	spin_unlock(&b->c->btree_read_time_lock);

	smp_wmb(); /* read_done is our write lock */
	set_btree_node_read_done(b);

	closure_return(cl);
	/*
	 * Reached only by goto (closure_return() presumably does not fall
	 * through - confirm); reports the failure and rejoins the exit path.
	 */
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    index(i, b), i->keys);
	goto out;
}
249
250void bch_btree_read(struct btree *b)
251{
252 BUG_ON(b->nsets || b->written);
253
254 if (!closure_trylock(&b->io.cl, &b->c->cl))
255 BUG();
256
257 b->io_start_time = local_clock();
258
259 btree_bio_init(b);
260 b->bio->bi_rw = REQ_META|READ_SYNC;
261 b->bio->bi_size = KEY_SIZE(&b->key) << 9;
262
263 bch_bio_map(b->bio, b->sets[0].data);
264
265 pr_debug("%s", pbtree(b));
266 trace_bcache_btree_read(b->bio);
267 bch_submit_bbio(b->bio, b->c, &b->key, 0);
268
269 continue_at(&b->io.cl, bch_btree_read_done, system_wq);
270}
271
272static void btree_complete_write(struct btree *b, struct btree_write *w)
273{
274 if (w->prio_blocked &&
275 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
276 wake_up(&b->c->alloc_wait);
277
278 if (w->journal) {
279 atomic_dec_bug(w->journal);
280 __closure_wake_up(&b->c->journal.wait);
281 }
282
283 if (w->owner)
284 closure_put(w->owner);
285
286 w->prio_blocked = 0;
287 w->journal = NULL;
288 w->owner = NULL;
289}
290
291static void __btree_write_done(struct closure *cl)
292{
293 struct btree *b = container_of(cl, struct btree, io.cl);
294 struct btree_write *w = btree_prev_write(b);
295
296 bch_bbio_free(b->bio, b->c);
297 b->bio = NULL;
298 btree_complete_write(b, w);
299
300 if (btree_node_dirty(b))
301 queue_delayed_work(btree_io_wq, &b->work,
302 msecs_to_jiffies(30000));
303
304 closure_return(cl);
305}
306
307static void btree_write_done(struct closure *cl)
308{
309 struct btree *b = container_of(cl, struct btree, io.cl);
310 struct bio_vec *bv;
311 int n;
312
313 __bio_for_each_segment(bv, b->bio, n, 0)
314 __free_page(bv->bv_page);
315
316 __btree_write_done(cl);
317}
318
/*
 * Submit the write of the last set of node @b.
 *
 * The set is stamped with the current version and its checksum. If fresh
 * pages can be allocated, the set is copied into them and written
 * asynchronously; otherwise the write is issued straight from the node's
 * buffer and waited for.
 */
static void do_btree_write(struct btree *b)
{
	struct closure *cl = &b->io.cl;
	struct bset *i = b->sets[b->nsets].data;
	BKEY_PADDED(key) k;

	i->version	= BCACHE_BSET_VERSION;
	i->csum		= btree_csum_set(b, i);

	btree_bio_init(b);
	b->bio->bi_rw	= REQ_META|WRITE_SYNC;
	b->bio->bi_size	= set_blocks(i, b->c) * block_bytes(b->c);
	bch_bio_map(b->bio, i);

	/* Target key: the node's key with pointer 0 advanced to this set */
	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));

	if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
		int j;
		struct bio_vec *bv;
		/* Address of the page containing the start of the set */
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

		/* Copy the set page by page into the newly allocated pages */
		bio_for_each_segment(bv, b->bio, j)
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		trace_bcache_btree_write(b->bio);
		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		/* btree_write_done() frees the copied pages */
		continue_at(cl, btree_write_done, NULL);
	} else {
		/* No pages available: write directly from the node's buffer */
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		trace_bcache_btree_write(b->bio);
		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		/* Wait for the write before finishing, since no copy was made */
		closure_sync(cl);
		__btree_write_done(cl);
	}
}
360
361static void __btree_write(struct btree *b)
362{
363 struct bset *i = b->sets[b->nsets].data;
364
365 BUG_ON(current->bio_list);
366
367 closure_lock(&b->io, &b->c->cl);
368 cancel_delayed_work(&b->work);
369
370 clear_bit(BTREE_NODE_dirty, &b->flags);
371 change_bit(BTREE_NODE_write_idx, &b->flags);
372
373 bch_check_key_order(b, i);
374 BUG_ON(b->written && !i->keys);
375
376 do_btree_write(b);
377
378 pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
379
380 b->written += set_blocks(i, b->c);
381 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
382 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
383
384 bch_btree_sort_lazy(b);
385
386 if (b->written < btree_blocks(b))
387 bch_bset_init_next(b);
388}
389
390static void btree_write_work(struct work_struct *w)
391{
392 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
393
394 down_write(&b->lock);
395
396 if (btree_node_dirty(b))
397 __btree_write(b);
398 up_write(&b->lock);
399}
400
401void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
402{
403 struct bset *i = b->sets[b->nsets].data;
404 struct btree_write *w = btree_current_write(b);
405
406 BUG_ON(b->written &&
407 (b->written >= btree_blocks(b) ||
408 i->seq != b->sets[0].data->seq ||
409 !i->keys));
410
411 if (!btree_node_dirty(b)) {
412 set_btree_node_dirty(b);
413 queue_delayed_work(btree_io_wq, &b->work,
414 msecs_to_jiffies(30000));
415 }
416
417 w->prio_blocked += b->prio_blocked;
418 b->prio_blocked = 0;
419
420 if (op && op->journal && !b->level) {
421 if (w->journal &&
422 journal_pin_cmp(b->c, w, op)) {
423 atomic_dec_bug(w->journal);
424 w->journal = NULL;
425 }
426
427 if (!w->journal) {
428 w->journal = op->journal;
429 atomic_inc(w->journal);
430 }
431 }
432
433 if (current->bio_list)
434 return;
435
436 /* Force write if set is too big */
437 if (now ||
438 b->level ||
439 set_bytes(i) > PAGE_SIZE - 48) {
440 if (op && now) {
441 /* Must wait on multiple writes */
442 BUG_ON(w->owner);
443 w->owner = &op->cl;
444 closure_get(&op->cl);
445 }
446
447 __btree_write(b);
448 }
449 BUG_ON(!b->written);
450}
451
452/*
453 * Btree in memory cache - allocation/freeing
454 * mca -> memory cache
455 */
456
457static void mca_reinit(struct btree *b)
458{
459 unsigned i;
460
461 b->flags = 0;
462 b->written = 0;
463 b->nsets = 0;
464
465 for (i = 0; i < MAX_BSETS; i++)
466 b->sets[i].size = 0;
467 /*
468 * Second loop starts at 1 because b->sets[0]->data is the memory we
469 * allocated
470 */
471 for (i = 1; i < MAX_BSETS; i++)
472 b->sets[i].data = NULL;
473}
474
/*
 * mca_reserve(c): number of cached btree nodes held in reserve for cache set
 * @c - eight per level of the root (treating a missing or level-0 root as
 * level 1) plus sixteen.
 *
 * mca_can_free(c): how many cached nodes exceed that reserve, never negative.
 *
 * The argument is parenthesised on every use so that any pointer expression
 * (for example &set) may be passed safely.
 */
#define mca_reserve(c)	((((c)->root && (c)->root->level)		\
			  ? (c)->root->level : 1) * 8 + 16)
#define mca_can_free(c)						\
	max_t(int, 0, (c)->bucket_cache_used - mca_reserve(c))
479
480static void mca_data_free(struct btree *b)
481{
482 struct bset_tree *t = b->sets;
483 BUG_ON(!closure_is_unlocked(&b->io.cl));
484
485 if (bset_prev_bytes(b) < PAGE_SIZE)
486 kfree(t->prev);
487 else
488 free_pages((unsigned long) t->prev,
489 get_order(bset_prev_bytes(b)));
490
491 if (bset_tree_bytes(b) < PAGE_SIZE)
492 kfree(t->tree);
493 else
494 free_pages((unsigned long) t->tree,
495 get_order(bset_tree_bytes(b)));
496
497 free_pages((unsigned long) t->data, b->page_order);
498
499 t->prev = NULL;
500 t->tree = NULL;
501 t->data = NULL;
502 list_move(&b->list, &b->c->btree_cache_freed);
503 b->c->bucket_cache_used--;
504}
505
506static void mca_bucket_free(struct btree *b)
507{
508 BUG_ON(btree_node_dirty(b));
509
510 b->key.ptr[0] = 0;
511 hlist_del_init_rcu(&b->hash);
512 list_move(&b->list, &b->c->btree_cache_freeable);
513}
514
515static unsigned btree_order(struct bkey *k)
516{
517 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
518}
519
520static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
521{
522 struct bset_tree *t = b->sets;
523 BUG_ON(t->data);
524
525 b->page_order = max_t(unsigned,
526 ilog2(b->c->btree_pages),
527 btree_order(k));
528
529 t->data = (void *) __get_free_pages(gfp, b->page_order);
530 if (!t->data)
531 goto err;
532
533 t->tree = bset_tree_bytes(b) < PAGE_SIZE
534 ? kmalloc(bset_tree_bytes(b), gfp)
535 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
536 if (!t->tree)
537 goto err;
538
539 t->prev = bset_prev_bytes(b) < PAGE_SIZE
540 ? kmalloc(bset_prev_bytes(b), gfp)
541 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
542 if (!t->prev)
543 goto err;
544
545 list_move(&b->list, &b->c->btree_cache);
546 b->c->bucket_cache_used++;
547 return;
548err:
549 mca_data_free(b);
550}
551
552static struct btree *mca_bucket_alloc(struct cache_set *c,
553 struct bkey *k, gfp_t gfp)
554{
555 struct btree *b = kzalloc(sizeof(struct btree), gfp);
556 if (!b)
557 return NULL;
558
559 init_rwsem(&b->lock);
560 lockdep_set_novalidate_class(&b->lock);
561 INIT_LIST_HEAD(&b->list);
562 INIT_DELAYED_WORK(&b->work, btree_write_work);
563 b->c = c;
564 closure_init_unlocked(&b->io);
565
566 mca_data_alloc(b, k, gfp);
567 return b;
568}
569
570static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
571{
572 lockdep_assert_held(&b->c->bucket_lock);
573
574 if (!down_write_trylock(&b->lock))
575 return -ENOMEM;
576
577 if (b->page_order < min_order) {
578 rw_unlock(true, b);
579 return -ENOMEM;
580 }
581
582 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
583
584 if (cl && btree_node_dirty(b))
585 bch_btree_write(b, true, NULL);
586
587 if (cl)
588 closure_wait_event_async(&b->io.wait, cl,
589 atomic_read(&b->io.cl.remaining) == -1);
590
591 if (btree_node_dirty(b) ||
592 !closure_is_unlocked(&b->io.cl) ||
593 work_pending(&b->work.work)) {
594 rw_unlock(true, b);
595 return -EAGAIN;
596 }
597
598 return 0;
599}
600
601static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
602{
603 struct cache_set *c = container_of(shrink, struct cache_set, shrink);
604 struct btree *b, *t;
605 unsigned long i, nr = sc->nr_to_scan;
606
607 if (c->shrinker_disabled)
608 return 0;
609
610 if (c->try_harder)
611 return 0;
612
613 /*
614 * If nr == 0, we're supposed to return the number of items we have
615 * cached. Not allowed to return -1.
616 */
617 if (!nr)
618 return mca_can_free(c) * c->btree_pages;
619
620 /* Return -1 if we can't do anything right now */
621 if (sc->gfp_mask & __GFP_WAIT)
622 mutex_lock(&c->bucket_lock);
623 else if (!mutex_trylock(&c->bucket_lock))
624 return -1;
625
626 nr /= c->btree_pages;
627 nr = min_t(unsigned long, nr, mca_can_free(c));
628
629 i = 0;
630 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
631 if (!nr)
632 break;
633
634 if (++i > 3 &&
635 !mca_reap(b, NULL, 0)) {
636 mca_data_free(b);
637 rw_unlock(true, b);
638 --nr;
639 }
640 }
641
642 /*
643 * Can happen right when we first start up, before we've read in any
644 * btree nodes
645 */
646 if (list_empty(&c->btree_cache))
647 goto out;
648
649 for (i = 0; nr && i < c->bucket_cache_used; i++) {
650 b = list_first_entry(&c->btree_cache, struct btree, list);
651 list_rotate_left(&c->btree_cache);
652
653 if (!b->accessed &&
654 !mca_reap(b, NULL, 0)) {
655 mca_bucket_free(b);
656 mca_data_free(b);
657 rw_unlock(true, b);
658 --nr;
659 } else
660 b->accessed = 0;
661 }
662out:
663 nr = mca_can_free(c) * c->btree_pages;
664 mutex_unlock(&c->bucket_lock);
665 return nr;
666}
667
668void bch_btree_cache_free(struct cache_set *c)
669{
670 struct btree *b;
671 struct closure cl;
672 closure_init_stack(&cl);
673
674 if (c->shrink.list.next)
675 unregister_shrinker(&c->shrink);
676
677 mutex_lock(&c->bucket_lock);
678
679#ifdef CONFIG_BCACHE_DEBUG
680 if (c->verify_data)
681 list_move(&c->verify_data->list, &c->btree_cache);
682#endif
683
684 list_splice(&c->btree_cache_freeable,
685 &c->btree_cache);
686
687 while (!list_empty(&c->btree_cache)) {
688 b = list_first_entry(&c->btree_cache, struct btree, list);
689
690 if (btree_node_dirty(b))
691 btree_complete_write(b, btree_current_write(b));
692 clear_bit(BTREE_NODE_dirty, &b->flags);
693
694 mca_data_free(b);
695 }
696
697 while (!list_empty(&c->btree_cache_freed)) {
698 b = list_first_entry(&c->btree_cache_freed,
699 struct btree, list);
700 list_del(&b->list);
701 cancel_delayed_work_sync(&b->work);
702 kfree(b);
703 }
704
705 mutex_unlock(&c->bucket_lock);
706}
707
708int bch_btree_cache_alloc(struct cache_set *c)
709{
710 unsigned i;
711
712 /* XXX: doesn't check for errors */
713
714 closure_init_unlocked(&c->gc);
715
716 for (i = 0; i < mca_reserve(c); i++)
717 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
718
719 list_splice_init(&c->btree_cache,
720 &c->btree_cache_freeable);
721
722#ifdef CONFIG_BCACHE_DEBUG
723 mutex_init(&c->verify_lock);
724
725 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
726
727 if (c->verify_data &&
728 c->verify_data->sets[0].data)
729 list_del_init(&c->verify_data->list);
730 else
731 c->verify_data = NULL;
732#endif
733
734 c->shrink.shrink = bch_mca_shrink;
735 c->shrink.seeks = 4;
736 c->shrink.batch = c->btree_pages * 2;
737 register_shrinker(&c->shrink);
738
739 return 0;
740}
741
742/* Btree in memory cache - hash table */
743
744static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
745{
746 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
747}
748
749static struct btree *mca_find(struct cache_set *c, struct bkey *k)
750{
751 struct btree *b;
752
753 rcu_read_lock();
754 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
755 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
756 goto out;
757 b = NULL;
758out:
759 rcu_read_unlock();
760 return b;
761}
762
763static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
764 int level, struct closure *cl)
765{
766 int ret = -ENOMEM;
767 struct btree *i;
768
769 if (!cl)
770 return ERR_PTR(-ENOMEM);
771
772 /*
773 * Trying to free up some memory - i.e. reuse some btree nodes - may
774 * require initiating IO to flush the dirty part of the node. If we're
775 * running under generic_make_request(), that IO will never finish and
776 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
777 * punt to workqueue and retry.
778 */
779 if (current->bio_list)
780 return ERR_PTR(-EAGAIN);
781
782 if (c->try_harder && c->try_harder != cl) {
783 closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
784 return ERR_PTR(-EAGAIN);
785 }
786
787 /* XXX: tracepoint */
788 c->try_harder = cl;
789 c->try_harder_start = local_clock();
790retry:
791 list_for_each_entry_reverse(i, &c->btree_cache, list) {
792 int r = mca_reap(i, cl, btree_order(k));
793 if (!r)
794 return i;
795 if (r != -ENOMEM)
796 ret = r;
797 }
798
799 if (ret == -EAGAIN &&
800 closure_blocking(cl)) {
801 mutex_unlock(&c->bucket_lock);
802 closure_sync(cl);
803 mutex_lock(&c->bucket_lock);
804 goto retry;
805 }
806
807 return ERR_PTR(ret);
808}
809
810/*
811 * We can only have one thread cannibalizing other cached btree nodes at a time,
812 * or we'll deadlock. We use an open coded mutex to ensure that, which a
813 * cannibalize_bucket() will take. This means every time we unlock the root of
814 * the btree, we need to release this lock if we have it held.
815 */
816void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl)
817{
818 if (c->try_harder == cl) {
819 bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
820 c->try_harder = NULL;
821 __closure_wake_up(&c->try_wait);
822 }
823}
824
825static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
826 int level, struct closure *cl)
827{
828 struct btree *b;
829
830 lockdep_assert_held(&c->bucket_lock);
831
832 if (mca_find(c, k))
833 return NULL;
834
835 /* btree_free() doesn't free memory; it sticks the node on the end of
836 * the list. Check if there's any freed nodes there:
837 */
838 list_for_each_entry(b, &c->btree_cache_freeable, list)
839 if (!mca_reap(b, NULL, btree_order(k)))
840 goto out;
841
842 /* We never free struct btree itself, just the memory that holds the on
843 * disk node. Check the freed list before allocating a new one:
844 */
845 list_for_each_entry(b, &c->btree_cache_freed, list)
846 if (!mca_reap(b, NULL, 0)) {
847 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
848 if (!b->sets[0].data)
849 goto err;
850 else
851 goto out;
852 }
853
854 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
855 if (!b)
856 goto err;
857
858 BUG_ON(!down_write_trylock(&b->lock));
859 if (!b->sets->data)
860 goto err;
861out:
862 BUG_ON(!closure_is_unlocked(&b->io.cl));
863
864 bkey_copy(&b->key, k);
865 list_move(&b->list, &c->btree_cache);
866 hlist_del_init_rcu(&b->hash);
867 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
868
869 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
870 b->level = level;
871
872 mca_reinit(b);
873
874 return b;
875err:
876 if (b)
877 rw_unlock(true, b);
878
879 b = mca_cannibalize(c, k, level, cl);
880 if (!IS_ERR(b))
881 goto out;
882
883 return b;
884}
885
886/**
887 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
888 * in from disk if necessary.
889 *
890 * If IO is necessary, it uses the closure embedded in struct btree_op to wait;
891 * if that closure is in non blocking mode, will return -EAGAIN.
892 *
893 * The btree node will have either a read or a write lock held, depending on
894 * level and op->lock.
895 */
896struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
897 int level, struct btree_op *op)
898{
899 int i = 0;
900 bool write = level <= op->lock;
901 struct btree *b;
902
903 BUG_ON(level < 0);
904retry:
905 b = mca_find(c, k);
906
907 if (!b) {
908 mutex_lock(&c->bucket_lock);
909 b = mca_alloc(c, k, level, &op->cl);
910 mutex_unlock(&c->bucket_lock);
911
912 if (!b)
913 goto retry;
914 if (IS_ERR(b))
915 return b;
916
917 bch_btree_read(b);
918
919 if (!write)
920 downgrade_write(&b->lock);
921 } else {
922 rw_lock(write, b, level);
923 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
924 rw_unlock(write, b);
925 goto retry;
926 }
927 BUG_ON(b->level != level);
928 }
929
930 b->accessed = 1;
931
932 for (; i <= b->nsets && b->sets[i].size; i++) {
933 prefetch(b->sets[i].tree);
934 prefetch(b->sets[i].data);
935 }
936
937 for (; i <= b->nsets; i++)
938 prefetch(b->sets[i].data);
939
940 if (!closure_wait_event(&b->io.wait, &op->cl,
941 btree_node_read_done(b))) {
942 rw_unlock(write, b);
943 b = ERR_PTR(-EAGAIN);
944 } else if (btree_node_io_error(b)) {
945 rw_unlock(write, b);
946 b = ERR_PTR(-EIO);
947 } else
948 BUG_ON(!b->written);
949
950 return b;
951}
952
953static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
954{
955 struct btree *b;
956
957 mutex_lock(&c->bucket_lock);
958 b = mca_alloc(c, k, level, NULL);
959 mutex_unlock(&c->bucket_lock);
960
961 if (!IS_ERR_OR_NULL(b)) {
962 bch_btree_read(b);
963 rw_unlock(true, b);
964 }
965}
966
967/* Btree alloc */
968
969static void btree_node_free(struct btree *b, struct btree_op *op)
970{
971 unsigned i;
972
973 /*
974 * The BUG_ON() in btree_node_get() implies that we must have a write
975 * lock on parent to free or even invalidate a node
976 */
977 BUG_ON(op->lock <= b->level);
978 BUG_ON(b == b->c->root);
979 pr_debug("bucket %s", pbtree(b));
980
981 if (btree_node_dirty(b))
982 btree_complete_write(b, btree_current_write(b));
983 clear_bit(BTREE_NODE_dirty, &b->flags);
984
985 if (b->prio_blocked &&
986 !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
987 wake_up(&b->c->alloc_wait);
988
989 b->prio_blocked = 0;
990
991 cancel_delayed_work(&b->work);
992
993 mutex_lock(&b->c->bucket_lock);
994
995 for (i = 0; i < KEY_PTRS(&b->key); i++) {
996 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
997
998 bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
999 PTR_BUCKET(b->c, &b->key, i));
1000 }
1001
1002 bch_bucket_free(b->c, &b->key);
1003 mca_bucket_free(b);
1004 mutex_unlock(&b->c->bucket_lock);
1005}
1006
1007struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
1008 struct closure *cl)
1009{
1010 BKEY_PADDED(key) k;
1011 struct btree *b = ERR_PTR(-EAGAIN);
1012
1013 mutex_lock(&c->bucket_lock);
1014retry:
1015 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
1016 goto err;
1017
1018 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1019
1020 b = mca_alloc(c, &k.key, level, cl);
1021 if (IS_ERR(b))
1022 goto err_free;
1023
1024 if (!b) {
1025 cache_bug(c,
1026 "Tried to allocate bucket that was in btree cache");
1027 __bkey_put(c, &k.key);
1028 goto retry;
1029 }
1030
1031 set_btree_node_read_done(b);
1032 b->accessed = 1;
1033 bch_bset_init_next(b);
1034
1035 mutex_unlock(&c->bucket_lock);
1036 return b;
1037err_free:
1038 bch_bucket_free(c, &k.key);
1039 __bkey_put(c, &k.key);
1040err:
1041 mutex_unlock(&c->bucket_lock);
1042 return b;
1043}
1044
1045static struct btree *btree_node_alloc_replacement(struct btree *b,
1046 struct closure *cl)
1047{
1048 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl);
1049 if (!IS_ERR_OR_NULL(n))
1050 bch_btree_sort_into(b, n);
1051
1052 return n;
1053}
1054
1055/* Garbage collection */
1056
1057uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1058{
1059 uint8_t stale = 0;
1060 unsigned i;
1061 struct bucket *g;
1062
1063 /*
1064 * ptr_invalid() can't return true for the keys that mark btree nodes as
1065 * freed, but since ptr_bad() returns true we'll never actually use them
1066 * for anything and thus we don't want mark their pointers here
1067 */
1068 if (!bkey_cmp(k, &ZERO_KEY))
1069 return stale;
1070
1071 for (i = 0; i < KEY_PTRS(k); i++) {
1072 if (!ptr_available(c, k, i))
1073 continue;
1074
1075 g = PTR_BUCKET(c, k, i);
1076
1077 if (gen_after(g->gc_gen, PTR_GEN(k, i)))
1078 g->gc_gen = PTR_GEN(k, i);
1079
1080 if (ptr_stale(c, k, i)) {
1081 stale = max(stale, ptr_stale(c, k, i));
1082 continue;
1083 }
1084
1085 cache_bug_on(GC_MARK(g) &&
1086 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1087 c, "inconsistent ptrs: mark = %llu, level = %i",
1088 GC_MARK(g), level);
1089
1090 if (level)
1091 SET_GC_MARK(g, GC_MARK_METADATA);
1092 else if (KEY_DIRTY(k))
1093 SET_GC_MARK(g, GC_MARK_DIRTY);
1094
1095 /* guard against overflow */
1096 SET_GC_SECTORS_USED(g, min_t(unsigned,
1097 GC_SECTORS_USED(g) + KEY_SIZE(k),
1098 (1 << 14) - 1));
1099
1100 BUG_ON(!GC_SECTORS_USED(g));
1101 }
1102
1103 return stale;
1104}
1105
/*
 * Mark key @k on behalf of btree node @b, using the node's cache set and
 * level. @b is parenthesised so any pointer expression may be passed.
 */
#define btree_mark_key(b, k)	__bch_btree_mark_key((b)->c, (b)->level, k)
1107
1108static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1109 struct gc_stat *gc)
1110{
1111 uint8_t stale = 0;
1112 unsigned last_dev = -1;
1113 struct bcache_device *d = NULL;
1114 struct bkey *k;
1115 struct btree_iter iter;
1116 struct bset_tree *t;
1117
1118 gc->nodes++;
1119
1120 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1121 if (last_dev != KEY_INODE(k)) {
1122 last_dev = KEY_INODE(k);
1123
1124 d = KEY_INODE(k) < b->c->nr_uuids
1125 ? b->c->devices[last_dev]
1126 : NULL;
1127 }
1128
1129 stale = max(stale, btree_mark_key(b, k));
1130
1131 if (bch_ptr_bad(b, k))
1132 continue;
1133
1134 *keys += bkey_u64s(k);
1135
1136 gc->key_bytes += bkey_u64s(k);
1137 gc->nkeys++;
1138
1139 gc->data += KEY_SIZE(k);
1140 if (KEY_DIRTY(k)) {
1141 gc->dirty += KEY_SIZE(k);
1142 if (d)
1143 d->sectors_dirty_gc += KEY_SIZE(k);
1144 }
1145 }
1146
1147 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
1148 btree_bug_on(t->size &&
1149 bset_written(b, t) &&
1150 bkey_cmp(&b->key, &t->end) < 0,
1151 b, "found short btree key in gc");
1152
1153 return stale;
1154}
1155
1156static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
1157 struct btree_op *op)
1158{
1159 /*
1160 * We block priorities from being written for the duration of garbage
1161 * collection, so we can't sleep in btree_alloc() ->
1162 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
1163 * our closure.
1164 */
1165 struct btree *n = btree_node_alloc_replacement(b, NULL);
1166
1167 if (!IS_ERR_OR_NULL(n)) {
1168 swap(b, n);
1169
1170 memcpy(k->ptr, b->key.ptr,
1171 sizeof(uint64_t) * KEY_PTRS(&b->key));
1172
1173 __bkey_put(b->c, &b->key);
1174 atomic_inc(&b->c->prio_blocked);
1175 b->prio_blocked++;
1176
1177 btree_node_free(n, op);
1178 up_write(&n->lock);
1179 }
1180
1181 return b;
1182}
1183
/*
 * Number of adjacent nodes considered for coalescing. Kept at 2 until
 * incremental garbage collection exists: a larger value works (4 has been
 * tested) but makes garbage collection take much longer, which hurts latency.
 */
#define GC_MERGE_NODES	2U

/* One candidate node tracked while garbage collection walks a parent. */
struct gc_merge_info {
	struct btree	*b;	/* the child node itself */
	struct bkey	*k;	/* key for that child, as found in the parent */
	unsigned	keys;	/* u64s of live keys counted in the child */
};
1196
1197static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1198 struct gc_stat *gc, struct gc_merge_info *r)
1199{
1200 unsigned nodes = 0, keys = 0, blocks;
1201 int i;
1202
1203 while (nodes < GC_MERGE_NODES && r[nodes].b)
1204 keys += r[nodes++].keys;
1205
1206 blocks = btree_default_blocks(b->c) * 2 / 3;
1207
1208 if (nodes < 2 ||
1209 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1210 return;
1211
1212 for (i = nodes - 1; i >= 0; --i) {
1213 if (r[i].b->written)
1214 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);
1215
1216 if (r[i].b->written)
1217 return;
1218 }
1219
1220 for (i = nodes - 1; i > 0; --i) {
1221 struct bset *n1 = r[i].b->sets->data;
1222 struct bset *n2 = r[i - 1].b->sets->data;
1223 struct bkey *k, *last = NULL;
1224
1225 keys = 0;
1226
1227 if (i == 1) {
1228 /*
1229 * Last node we're not getting rid of - we're getting
1230 * rid of the node at r[0]. Have to try and fit all of
1231 * the remaining keys into this node; we can't ensure
1232 * they will always fit due to rounding and variable
1233 * length keys (shouldn't be possible in practice,
1234 * though)
1235 */
1236 if (__set_blocks(n1, n1->keys + r->keys,
1237 b->c) > btree_blocks(r[i].b))
1238 return;
1239
1240 keys = n2->keys;
1241 last = &r->b->key;
1242 } else
1243 for (k = n2->start;
1244 k < end(n2);
1245 k = bkey_next(k)) {
1246 if (__set_blocks(n1, n1->keys + keys +
1247 bkey_u64s(k), b->c) > blocks)
1248 break;
1249
1250 last = k;
1251 keys += bkey_u64s(k);
1252 }
1253
1254 BUG_ON(__set_blocks(n1, n1->keys + keys,
1255 b->c) > btree_blocks(r[i].b));
1256
1257 if (last) {
1258 bkey_copy_key(&r[i].b->key, last);
1259 bkey_copy_key(r[i].k, last);
1260 }
1261
1262 memcpy(end(n1),
1263 n2->start,
1264 (void *) node(n2, keys) - (void *) n2->start);
1265
1266 n1->keys += keys;
1267
1268 memmove(n2->start,
1269 node(n2, keys),
1270 (void *) end(n2) - (void *) node(n2, keys));
1271
1272 n2->keys -= keys;
1273
1274 r[i].keys = n1->keys;
1275 r[i - 1].keys = n2->keys;
1276 }
1277
1278 btree_node_free(r->b, op);
1279 up_write(&r->b->lock);
1280
1281 pr_debug("coalesced %u nodes", nodes);
1282
1283 gc->nodes--;
1284 nodes--;
1285
1286 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
1287 memset(&r[nodes], 0, sizeof(struct gc_merge_info));
1288}
1289
1290static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1291 struct closure *writes, struct gc_stat *gc)
1292{
1293 void write(struct btree *r)
1294 {
1295 if (!r->written)
1296 bch_btree_write(r, true, op);
1297 else if (btree_node_dirty(r)) {
1298 BUG_ON(btree_current_write(r)->owner);
1299 btree_current_write(r)->owner = writes;
1300 closure_get(writes);
1301
1302 bch_btree_write(r, true, NULL);
1303 }
1304
1305 up_write(&r->lock);
1306 }
1307
1308 int ret = 0, stale;
1309 unsigned i;
1310 struct gc_merge_info r[GC_MERGE_NODES];
1311
1312 memset(r, 0, sizeof(r));
1313
1314 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
1315 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op);
1316
1317 if (IS_ERR(r->b)) {
1318 ret = PTR_ERR(r->b);
1319 break;
1320 }
1321
1322 r->keys = 0;
1323 stale = btree_gc_mark_node(r->b, &r->keys, gc);
1324
1325 if (!b->written &&
1326 (r->b->level || stale > 10 ||
1327 b->c->gc_always_rewrite))
1328 r->b = btree_gc_alloc(r->b, r->k, op);
1329
1330 if (r->b->level)
1331 ret = btree_gc_recurse(r->b, op, writes, gc);
1332
1333 if (ret) {
1334 write(r->b);
1335 break;
1336 }
1337
1338 bkey_copy_key(&b->c->gc_done, r->k);
1339
1340 if (!b->written)
1341 btree_gc_coalesce(b, op, gc, r);
1342
1343 if (r[GC_MERGE_NODES - 1].b)
1344 write(r[GC_MERGE_NODES - 1].b);
1345
1346 memmove(&r[1], &r[0],
1347 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
1348
1349 /* When we've got incremental GC working, we'll want to do
1350 * if (should_resched())
1351 * return -EAGAIN;
1352 */
1353 cond_resched();
1354#if 0
1355 if (need_resched()) {
1356 ret = -EAGAIN;
1357 break;
1358 }
1359#endif
1360 }
1361
1362 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
1363 write(r[i].b);
1364
1365 /* Might have freed some children, must remove their keys */
1366 if (!b->written)
1367 bch_btree_sort(b);
1368
1369 return ret;
1370}
1371
1372static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1373 struct closure *writes, struct gc_stat *gc)
1374{
1375 struct btree *n = NULL;
1376 unsigned keys = 0;
1377 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
1378
1379 if (b->level || stale > 10)
1380 n = btree_node_alloc_replacement(b, NULL);
1381
1382 if (!IS_ERR_OR_NULL(n))
1383 swap(b, n);
1384
1385 if (b->level)
1386 ret = btree_gc_recurse(b, op, writes, gc);
1387
1388 if (!b->written || btree_node_dirty(b)) {
1389 atomic_inc(&b->c->prio_blocked);
1390 b->prio_blocked++;
1391 bch_btree_write(b, true, n ? op : NULL);
1392 }
1393
1394 if (!IS_ERR_OR_NULL(n)) {
1395 closure_sync(&op->cl);
1396 bch_btree_set_root(b);
1397 btree_node_free(n, op);
1398 rw_unlock(true, b);
1399 }
1400
1401 return ret;
1402}
1403
1404static void btree_gc_start(struct cache_set *c)
1405{
1406 struct cache *ca;
1407 struct bucket *b;
1408 struct bcache_device **d;
1409 unsigned i;
1410
1411 if (!c->gc_mark_valid)
1412 return;
1413
1414 mutex_lock(&c->bucket_lock);
1415
1416 c->gc_mark_valid = 0;
1417 c->gc_done = ZERO_KEY;
1418
1419 for_each_cache(ca, c, i)
1420 for_each_bucket(b, ca) {
1421 b->gc_gen = b->gen;
1422 if (!atomic_read(&b->pin))
1423 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
1424 }
1425
1426 for (d = c->devices;
1427 d < c->devices + c->nr_uuids;
1428 d++)
1429 if (*d)
1430 (*d)->sectors_dirty_gc = 0;
1431
1432 mutex_unlock(&c->bucket_lock);
1433}
1434
1435size_t bch_btree_gc_finish(struct cache_set *c)
1436{
1437 size_t available = 0;
1438 struct bucket *b;
1439 struct cache *ca;
1440 struct bcache_device **d;
1441 unsigned i;
1442
1443 mutex_lock(&c->bucket_lock);
1444
1445 set_gc_sectors(c);
1446 c->gc_mark_valid = 1;
1447 c->need_gc = 0;
1448
1449 if (c->root)
1450 for (i = 0; i < KEY_PTRS(&c->root->key); i++)
1451 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
1452 GC_MARK_METADATA);
1453
1454 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1455 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1456 GC_MARK_METADATA);
1457
1458 for_each_cache(ca, c, i) {
1459 uint64_t *i;
1460
1461 ca->invalidate_needs_gc = 0;
1462
1463 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1464 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1465
1466 for (i = ca->prio_buckets;
1467 i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1468 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1469
1470 for_each_bucket(b, ca) {
1471 b->last_gc = b->gc_gen;
1472 c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1473
1474 if (!atomic_read(&b->pin) &&
1475 GC_MARK(b) == GC_MARK_RECLAIMABLE) {
1476 available++;
1477 if (!GC_SECTORS_USED(b))
1478 bch_bucket_add_unused(ca, b);
1479 }
1480 }
1481 }
1482
1483 for (d = c->devices;
1484 d < c->devices + c->nr_uuids;
1485 d++)
1486 if (*d) {
1487 unsigned long last =
1488 atomic_long_read(&((*d)->sectors_dirty));
1489 long difference = (*d)->sectors_dirty_gc - last;
1490
1491 pr_debug("sectors dirty off by %li", difference);
1492
1493 (*d)->sectors_dirty_last += difference;
1494
1495 atomic_long_set(&((*d)->sectors_dirty),
1496 (*d)->sectors_dirty_gc);
1497 }
1498
1499 mutex_unlock(&c->bucket_lock);
1500 return available;
1501}
1502
1503static void bch_btree_gc(struct closure *cl)
1504{
1505 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
1506 int ret;
1507 unsigned long available;
1508 struct gc_stat stats;
1509 struct closure writes;
1510 struct btree_op op;
1511
1512 uint64_t start_time = local_clock();
1513 trace_bcache_gc_start(c->sb.set_uuid);
1514 blktrace_msg_all(c, "Starting gc");
1515
1516 memset(&stats, 0, sizeof(struct gc_stat));
1517 closure_init_stack(&writes);
1518 bch_btree_op_init_stack(&op);
1519 op.lock = SHRT_MAX;
1520
1521 btree_gc_start(c);
1522
1523 ret = btree_root(gc_root, c, &op, &writes, &stats);
1524 closure_sync(&op.cl);
1525 closure_sync(&writes);
1526
1527 if (ret) {
1528 blktrace_msg_all(c, "Stopped gc");
1529 pr_warn("gc failed!");
1530
1531 continue_at(cl, bch_btree_gc, bch_gc_wq);
1532 }
1533
1534 /* Possibly wait for new UUIDs or whatever to hit disk */
1535 bch_journal_meta(c, &op.cl);
1536 closure_sync(&op.cl);
1537
1538 available = bch_btree_gc_finish(c);
1539
1540 bch_time_stats_update(&c->btree_gc_time, start_time);
1541
1542 stats.key_bytes *= sizeof(uint64_t);
1543 stats.dirty <<= 9;
1544 stats.data <<= 9;
1545 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1546 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1547 blktrace_msg_all(c, "Finished gc");
1548
1549 trace_bcache_gc_end(c->sb.set_uuid);
1550 wake_up(&c->alloc_wait);
1551
1552 continue_at(cl, bch_moving_gc, bch_gc_wq);
1553}
1554
1555void bch_queue_gc(struct cache_set *c)
1556{
1557 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
1558}
1559
1560/* Initial partial gc */
1561
1562static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1563 unsigned long **seen)
1564{
1565 int ret;
1566 unsigned i;
1567 struct bkey *k;
1568 struct bucket *g;
1569 struct btree_iter iter;
1570
1571 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1572 for (i = 0; i < KEY_PTRS(k); i++) {
1573 if (!ptr_available(b->c, k, i))
1574 continue;
1575
1576 g = PTR_BUCKET(b->c, k, i);
1577
1578 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
1579 seen[PTR_DEV(k, i)]) ||
1580 !ptr_stale(b->c, k, i)) {
1581 g->gen = PTR_GEN(k, i);
1582
1583 if (b->level)
1584 g->prio = BTREE_PRIO;
1585 else if (g->prio == BTREE_PRIO)
1586 g->prio = INITIAL_PRIO;
1587 }
1588 }
1589
1590 btree_mark_key(b, k);
1591 }
1592
1593 if (b->level) {
1594 k = bch_next_recurse_key(b, &ZERO_KEY);
1595
1596 while (k) {
1597 struct bkey *p = bch_next_recurse_key(b, k);
1598 if (p)
1599 btree_node_prefetch(b->c, p, b->level - 1);
1600
1601 ret = btree(check_recurse, k, b, op, seen);
1602 if (ret)
1603 return ret;
1604
1605 k = p;
1606 }
1607 }
1608
1609 return 0;
1610}
1611
1612int bch_btree_check(struct cache_set *c, struct btree_op *op)
1613{
1614 int ret = -ENOMEM;
1615 unsigned i;
1616 unsigned long *seen[MAX_CACHES_PER_SET];
1617
1618 memset(seen, 0, sizeof(seen));
1619
1620 for (i = 0; c->cache[i]; i++) {
1621 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
1622 seen[i] = kmalloc(n, GFP_KERNEL);
1623 if (!seen[i])
1624 goto err;
1625
1626 /* Disables the seen array until prio_read() uses it too */
1627 memset(seen[i], 0xFF, n);
1628 }
1629
1630 ret = btree_root(check_recurse, c, op, seen);
1631err:
1632 for (i = 0; i < MAX_CACHES_PER_SET; i++)
1633 kfree(seen[i]);
1634 return ret;
1635}
1636
1637/* Btree insertion */
1638
1639static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1640{
1641 struct bset *i = b->sets[b->nsets].data;
1642
1643 memmove((uint64_t *) where + bkey_u64s(insert),
1644 where,
1645 (void *) end(i) - (void *) where);
1646
1647 i->keys += bkey_u64s(insert);
1648 bkey_copy(where, insert);
1649 bch_bset_fix_lookup_table(b, where);
1650}
1651
/*
 * fix_overlapping_extents - make room for @insert among the extents in @b
 * @b:		btree node being inserted into
 * @insert:	key about to be inserted
 * @iter:	iterator over @b, already positioned by the caller
 * @op:		current operation; op->type and op->replace drive the
 *		BTREE_REPLACE checks
 *
 * Walks the keys returned by @iter that overlap @insert and trims, splits or
 * zeroes them so they no longer overlap.  The dirty sector count of the owning
 * device is reduced by whatever is cut from dirty keys.
 *
 * For BTREE_REPLACE, each non-empty overlapping key is additionally compared
 * against op->replace; sectors_found tracks how much of @insert was matched.
 *
 * Returns true if the insert must be dropped (BTREE_REPLACE matched nothing;
 * op->insert_collision is set), false otherwise.  On a partial BTREE_REPLACE
 * match @insert is shrunk to the matched sectors.
 */
1652static bool fix_overlapping_extents(struct btree *b,
1653 struct bkey *insert,
1654 struct btree_iter *iter,
1655 struct btree_op *op)
1656{
 /*
  * Nested function (GCC extension): subtracts @sectors from the dirty count
  * of the device k belongs to, if k is dirty and that device exists.
  */
1657 void subtract_dirty(struct bkey *k, int sectors)
1658 {
1659 struct bcache_device *d = b->c->devices[KEY_INODE(k)];
1660
1661 if (KEY_DIRTY(k) && d)
1662 atomic_long_sub(sectors, &d->sectors_dirty);
1663 }
1664
1665 unsigned old_size, sectors_found = 0;
1666
1667 while (1) {
1668 struct bkey *k = bch_btree_iter_next(iter);
 /* Stop once k starts at or after the end of insert */
1669 if (!k ||
1670 bkey_cmp(&START_KEY(k), insert) >= 0)
1671 break;
1672
 /* k ends at or before the start of insert: no overlap yet */
1673 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1674 continue;
1675
1676 old_size = KEY_SIZE(k);
1677
1678 /*
1679 * We might overlap with 0 size extents; we can't skip these
1680 * because if they're in the set we're inserting to we have to
1681 * adjust them so they don't overlap with the key we're
1682 * inserting. But we don't want to check them for BTREE_REPLACE
1683 * operations.
1684 */
1685
1686 if (op->type == BTREE_REPLACE &&
1687 KEY_SIZE(k)) {
1688 /*
1689 * k might have been split since we inserted/found the
1690 * key we're replacing
1691 */
1692 unsigned i;
1693 uint64_t offset = KEY_START(k) -
1694 KEY_START(&op->replace);
1695
1696 /* But it must be a subset of the replace key */
1697 if (KEY_START(k) < KEY_START(&op->replace) ||
1698 KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
1699 goto check_failed;
1700
1701 /* We didn't find a key that we were supposed to */
1702 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1703 goto check_failed;
1704
1705 if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
1706 goto check_failed;
1707
1708 /* skip past gen */
1709 offset <<= 8;
1710
1711 BUG_ON(!KEY_PTRS(&op->replace));
1712
1713 for (i = 0; i < KEY_PTRS(&op->replace); i++)
1714 if (k->ptr[i] != op->replace.ptr[i] + offset)
1715 goto check_failed;
1716
1717 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1718 }
1719
1720 if (bkey_cmp(insert, k) < 0 &&
1721 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1722 /*
1723 * We overlapped in the middle of an existing key: that
1724 * means we have to split the old key. But we have to do
1725 * slightly different things depending on whether the
1726 * old key has been written out yet.
1727 */
1728
1729 struct bkey *top;
1730
1731 subtract_dirty(k, KEY_SIZE(insert));
1732
1733 if (bkey_written(b, k)) {
1734 /*
1735 * We insert a new key to cover the top of the
1736 * old key, and the old key is modified in place
1737 * to represent the bottom split.
1738 *
1739 * It's completely arbitrary whether the new key
1740 * is the top or the bottom, but it has to match
1741 * up with what btree_sort_fixup() does - it
1742 * doesn't check for this kind of overlap, it
1743 * depends on us inserting a new key for the top
1744 * here.
1745 */
1746 top = bch_bset_search(b, &b->sets[b->nsets],
1747 insert);
1748 shift_keys(b, top, k);
1749 } else {
 /* Unwritten key: duplicate it in place; the copy after k becomes the top */
1750 BKEY_PADDED(key) temp;
1751 bkey_copy(&temp.key, k);
1752 shift_keys(b, k, &temp.key);
1753 top = bkey_next(k);
1754 }
1755
1756 bch_cut_front(insert, top);
1757 bch_cut_back(&START_KEY(insert), k);
1758 bch_bset_fix_invalidated_key(b, k);
1759 return false;
1760 }
1761
1762 if (bkey_cmp(insert, k) < 0) {
 /* insert ends inside k: trim the front of k */
1763 bch_cut_front(insert, k);
1764 } else {
1765 if (bkey_written(b, k) &&
1766 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1767 /*
1768 * Completely overwrote, so we don't have to
1769 * invalidate the binary search tree
1770 */
1771 bch_cut_front(k, k);
1772 } else {
1773 __bch_cut_back(&START_KEY(insert), k);
1774 bch_bset_fix_invalidated_key(b, k);
1775 }
1776 }
1777
 /* Account for however many sectors were just cut from k */
1778 subtract_dirty(k, old_size - KEY_SIZE(k));
1779 }
1780
 /* Reached both on a failed BTREE_REPLACE check and on normal loop exit */
1781check_failed:
1782 if (op->type == BTREE_REPLACE) {
1783 if (!sectors_found) {
1784 op->insert_collision = true;
1785 return true;
1786 } else if (sectors_found < KEY_SIZE(insert)) {
1787 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1788 (KEY_SIZE(insert) - sectors_found));
1789 SET_KEY_SIZE(insert, sectors_found);
1790 }
1791 }
1792
1793 return false;
1794}
1795
/*
 * btree_insert_key - insert a single key into the last set of @b
 * @b:	btree node to insert into
 * @op:	current operation (passed on to fix_overlapping_extents())
 * @k:	key to insert
 *
 * In a leaf (b->level == 0) overlapping extents are fixed up first and, unless
 * key merging is disabled, a back merge, an overwrite of a zero-size key, or a
 * front merge is attempted before falling back to a plain insert.  In interior
 * nodes the insert position comes straight from bch_bset_search().
 *
 * Returns false if fix_overlapping_extents() told us to drop the key, true
 * otherwise.
 */
1796static bool btree_insert_key(struct btree *b, struct btree_op *op,
1797 struct bkey *k)
1798{
1799 struct bset *i = b->sets[b->nsets].data;
1800 struct bkey *m, *prev;
 /* Only used in the debug/check messages below */
1801 const char *status = "insert";
1802
1803 BUG_ON(bkey_cmp(k, &b->key) > 0);
1804 BUG_ON(b->level && !KEY_PTRS(k));
1805 BUG_ON(!b->level && !KEY_OFFSET(k));
1806
1807 if (!b->level) {
1808 struct btree_iter iter;
1809 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
1810
1811 /*
1812 * bset_search() returns the first key that is strictly greater
1813 * than the search key - but for back merging, we want to find
1814 * the first key that is greater than or equal to KEY_START(k) -
1815 * unless KEY_START(k) is 0.
1816 */
1817 if (KEY_OFFSET(&search))
1818 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
1819
1820 prev = NULL;
1821 m = bch_btree_iter_init(b, &iter, &search);
1822
1823 if (fix_overlapping_extents(b, k, &iter, op))
1824 return false;
1825
 /* Advance m to the insert position, remembering the key before it */
1826 while (m != end(i) &&
1827 bkey_cmp(k, &START_KEY(m)) > 0)
1828 prev = m, m = bkey_next(m);
1829
1830 if (key_merging_disabled(b->c))
1831 goto insert;
1832
1833 /* prev is in the tree, if we merge we're done */
1834 status = "back merging";
1835 if (prev &&
1836 bch_bkey_try_merge(b, prev, k))
1837 goto merged;
1838
1839 status = "overwrote front";
1840 if (m != end(i) &&
1841 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1842 goto copy;
1843
1844 status = "front merge";
1845 if (m != end(i) &&
1846 bch_bkey_try_merge(b, k, m))
1847 goto copy;
1848 } else
1849 m = bch_bset_search(b, &b->sets[b->nsets], k);
1850
 /*
  * The three labels fall through: "insert" opens a gap and copies, "copy"
  * overwrites the key at m in place, "merged" only runs the checks.
  */
1851insert: shift_keys(b, m, k);
1852copy: bkey_copy(m, k);
1853merged:
1854 bch_check_keys(b, "%s for %s at %s: %s", status,
1855 op_type(op), pbtree(b), pkey(k));
1856 bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status,
1857 op_type(op), pbtree(b), pkey(k));
1858
1859 if (b->level && !KEY_OFFSET(k))
1860 b->prio_blocked++;
1861
1862 pr_debug("%s for %s at %s: %s", status,
1863 op_type(op), pbtree(b), pkey(k));
1864
1865 return true;
1866}
1867
1868bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
1869{
1870 bool ret = false;
1871 struct bkey *k;
1872 unsigned oldsize = bch_count_data(b);
1873
1874 while ((k = bch_keylist_pop(&op->keys))) {
1875 bkey_put(b->c, k, b->level);
1876 ret |= btree_insert_key(b, op, k);
1877 }
1878
1879 BUG_ON(bch_count_data(b) < oldsize);
1880 return ret;
1881}
1882
1883bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
1884 struct bio *bio)
1885{
1886 bool ret = false;
1887 uint64_t btree_ptr = b->key.ptr[0];
1888 unsigned long seq = b->seq;
1889 BKEY_PADDED(k) tmp;
1890
1891 rw_unlock(false, b);
1892 rw_lock(true, b, b->level);
1893
1894 if (b->key.ptr[0] != btree_ptr ||
1895 b->seq != seq + 1 ||
1896 should_split(b))
1897 goto out;
1898
1899 op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));
1900
1901 SET_KEY_PTRS(&op->replace, 1);
1902 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
1903
1904 SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);
1905
1906 bkey_copy(&tmp.k, &op->replace);
1907
1908 BUG_ON(op->type != BTREE_INSERT);
1909 BUG_ON(!btree_insert_key(b, op, &tmp.k));
1910 bch_btree_write(b, false, NULL);
1911 ret = true;
1912out:
1913 downgrade_write(&b->lock);
1914 return ret;
1915}
1916
/*
 * btree_split - rewrite @b into one or two new nodes and insert op->keys
 * @b:	node to replace
 * @op:	operation carrying the keys to insert and the closure to wait on
 *
 * A replacement node n1 is always allocated.  If its first set takes up more
 * than 4/5 of the node, the keys are divided between n1 and a second node n2;
 * if @b was the root, a new root n3 one level up is allocated as well.  The
 * keys pointing at the new node(s) are left on op->keys for the parent, unless
 * a root was replaced, in which case the new root is installed here.
 *
 * Returns 0 on success, -EAGAIN if any allocation returned ERR_PTR(-EAGAIN),
 * and -ENOMEM for any other allocation failure.
 */
1917static int btree_split(struct btree *b, struct btree_op *op)
1918{
1919 bool split, root = b == b->c->root;
1920 struct btree *n1, *n2 = NULL, *n3 = NULL;
1921 uint64_t start_time = local_clock();
1922
1923 if (b->level)
1924 set_closure_blocking(&op->cl);
1925
1926 n1 = btree_node_alloc_replacement(b, &op->cl);
1927 if (IS_ERR(n1))
1928 goto err;
1929
1930 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
1931
1932 pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
1933 pbtree(b), n1->sets[0].data->keys);
1934
1935 if (split) {
1936 unsigned keys = 0;
1937
1938 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
1939 if (IS_ERR(n2))
1940 goto err_free1;
1941
1942 if (root) {
1943 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
1944 if (IS_ERR(n3))
1945 goto err_free2;
1946 }
1947
1948 bch_btree_insert_keys(n1, op);
1949
1950 /* Has to be a linear search because we don't have an auxiliary
1951 * search tree yet
1952 */
1953
 /* Find the key boundary at roughly 3/5 of n1's keys */
1954 while (keys < (n1->sets[0].data->keys * 3) / 5)
1955 keys += bkey_u64s(node(n1->sets[0].data, keys));
1956
 /* That key becomes n1's key; n1 keeps everything up to and including it */
1957 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
1958 keys += bkey_u64s(node(n1->sets[0].data, keys));
1959
1960 n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
1961 n1->sets[0].data->keys = keys;
1962
 /* end(n1 set) now points at the first key that belongs to n2 */
1963 memcpy(n2->sets[0].data->start,
1964 end(n1->sets[0].data),
1965 n2->sets[0].data->keys * sizeof(uint64_t));
1966
1967 bkey_copy_key(&n2->key, &b->key);
1968
1969 bch_keylist_add(&op->keys, &n2->key);
1970 bch_btree_write(n2, true, op);
1971 rw_unlock(true, n2);
1972 } else
1973 bch_btree_insert_keys(n1, op);
1974
1975 bch_keylist_add(&op->keys, &n1->key);
1976 bch_btree_write(n1, true, op);
1977
1978 if (n3) {
 /* Split the root: the pointers to n1/n2 go into the new root n3 */
1979 bkey_copy_key(&n3->key, &MAX_KEY);
1980 bch_btree_insert_keys(n3, op);
1981 bch_btree_write(n3, true, op);
1982
1983 closure_sync(&op->cl);
1984 bch_btree_set_root(n3);
1985 rw_unlock(true, n3);
1986 } else if (root) {
 /* Root rewritten without splitting: nothing to hand to a parent */
1987 op->keys.top = op->keys.bottom;
1988 closure_sync(&op->cl);
1989 bch_btree_set_root(n1);
1990 } else {
1991 unsigned i;
1992
 /*
  * Queue a copy of the old node's key with its key fields zeroed and
  * each pointer gen set to the bucket's gen + 1, alongside the new keys.
  */
1993 bkey_copy(op->keys.top, &b->key);
1994 bkey_copy_key(op->keys.top, &ZERO_KEY);
1995
1996 for (i = 0; i < KEY_PTRS(&b->key); i++) {
1997 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
1998
1999 SET_PTR_GEN(op->keys.top, i, g);
2000 }
2001
2002 bch_keylist_push(&op->keys);
2003 closure_sync(&op->cl);
2004 atomic_inc(&b->c->prio_blocked);
2005 }
2006
2007 rw_unlock(true, n1);
2008 btree_node_free(b, op);
2009
2010 bch_time_stats_update(&b->c->btree_split_time, start_time);
2011
2012 return 0;
 /* Error unwinding: release the nodes in reverse order of allocation */
2013err_free2:
2014 __bkey_put(n2->c, &n2->key);
2015 btree_node_free(n2, op);
2016 rw_unlock(true, n2);
2017err_free1:
2018 __bkey_put(n1->c, &n1->key);
2019 btree_node_free(n1, op);
2020 rw_unlock(true, n1);
2021err:
2022 if (n3 == ERR_PTR(-EAGAIN) ||
2023 n2 == ERR_PTR(-EAGAIN) ||
2024 n1 == ERR_PTR(-EAGAIN))
2025 return -EAGAIN;
2026
2027 pr_warn("couldn't split");
2028 return -ENOMEM;
2029}
2030
2031static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
2032 struct keylist *stack_keys)
2033{
2034 if (b->level) {
2035 int ret;
2036 struct bkey *insert = op->keys.bottom;
2037 struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
2038
2039 if (!k) {
2040 btree_bug(b, "no key to recurse on at level %i/%i",
2041 b->level, b->c->root->level);
2042
2043 op->keys.top = op->keys.bottom;
2044 return -EIO;
2045 }
2046
2047 if (bkey_cmp(insert, k) > 0) {
2048 unsigned i;
2049
2050 if (op->type == BTREE_REPLACE) {
2051 __bkey_put(b->c, insert);
2052 op->keys.top = op->keys.bottom;
2053 op->insert_collision = true;
2054 return 0;
2055 }
2056
2057 for (i = 0; i < KEY_PTRS(insert); i++)
2058 atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);
2059
2060 bkey_copy(stack_keys->top, insert);
2061
2062 bch_cut_back(k, insert);
2063 bch_cut_front(k, stack_keys->top);
2064
2065 bch_keylist_push(stack_keys);
2066 }
2067
2068 ret = btree(insert_recurse, k, b, op, stack_keys);
2069 if (ret)
2070 return ret;
2071 }
2072
2073 if (!bch_keylist_empty(&op->keys)) {
2074 if (should_split(b)) {
2075 if (op->lock <= b->c->root->level) {
2076 BUG_ON(b->level);
2077 op->lock = b->c->root->level + 1;
2078 return -EINTR;
2079 }
2080 return btree_split(b, op);
2081 }
2082
2083 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2084
2085 if (bch_btree_insert_keys(b, op))
2086 bch_btree_write(b, false, op);
2087 }
2088
2089 return 0;
2090}
2091
2092int bch_btree_insert(struct btree_op *op, struct cache_set *c)
2093{
2094 int ret = 0;
2095 struct keylist stack_keys;
2096
2097 /*
2098 * Don't want to block with the btree locked unless we have to,
2099 * otherwise we get deadlocks with try_harder and between split/gc
2100 */
2101 clear_closure_blocking(&op->cl);
2102
2103 BUG_ON(bch_keylist_empty(&op->keys));
2104 bch_keylist_copy(&stack_keys, &op->keys);
2105 bch_keylist_init(&op->keys);
2106
2107 while (!bch_keylist_empty(&stack_keys) ||
2108 !bch_keylist_empty(&op->keys)) {
2109 if (bch_keylist_empty(&op->keys)) {
2110 bch_keylist_add(&op->keys,
2111 bch_keylist_pop(&stack_keys));
2112 op->lock = 0;
2113 }
2114
2115 ret = btree_root(insert_recurse, c, op, &stack_keys);
2116
2117 if (ret == -EAGAIN) {
2118 ret = 0;
2119 closure_sync(&op->cl);
2120 } else if (ret) {
2121 struct bkey *k;
2122
2123 pr_err("error %i trying to insert key for %s",
2124 ret, op_type(op));
2125
2126 while ((k = bch_keylist_pop(&stack_keys) ?:
2127 bch_keylist_pop(&op->keys)))
2128 bkey_put(c, k, 0);
2129 }
2130 }
2131
2132 bch_keylist_free(&stack_keys);
2133
2134 if (op->journal)
2135 atomic_dec_bug(op->journal);
2136 op->journal = NULL;
2137 return ret;
2138}
2139
2140void bch_btree_set_root(struct btree *b)
2141{
2142 unsigned i;
2143
2144 BUG_ON(!b->written);
2145
2146 for (i = 0; i < KEY_PTRS(&b->key); i++)
2147 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2148
2149 mutex_lock(&b->c->bucket_lock);
2150 list_del_init(&b->list);
2151 mutex_unlock(&b->c->bucket_lock);
2152
2153 b->c->root = b;
2154 __bkey_put(b->c, &b->key);
2155
2156 bch_journal_meta(b->c, NULL);
2157 pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
2158}
2159
2160/* Cache lookup */
2161
2162static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
2163 struct bkey *k)
2164{
2165 struct search *s = container_of(op, struct search, op);
2166 struct bio *bio = &s->bio.bio;
2167 int ret = 0;
2168
2169 while (!ret &&
2170 !op->lookup_done) {
2171 unsigned sectors = INT_MAX;
2172
2173 if (KEY_INODE(k) == op->inode) {
2174 if (KEY_START(k) <= bio->bi_sector)
2175 break;
2176
2177 sectors = min_t(uint64_t, sectors,
2178 KEY_START(k) - bio->bi_sector);
2179 }
2180
2181 ret = s->d->cache_miss(b, s, bio, sectors);
2182 }
2183
2184 return ret;
2185}
2186
2187/*
2188 * Read from a single key, handling the initial cache miss if the key starts in
2189 * the middle of the bio
2190 */
2191static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2192 struct bkey *k)
2193{
2194 struct search *s = container_of(op, struct search, op);
2195 struct bio *bio = &s->bio.bio;
2196 unsigned ptr;
2197 struct bio *n;
2198
2199 int ret = submit_partial_cache_miss(b, op, k);
2200 if (ret || op->lookup_done)
2201 return ret;
2202
2203 /* XXX: figure out best pointer - for multiple cache devices */
2204 ptr = 0;
2205
2206 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
2207
2208 while (!op->lookup_done &&
2209 KEY_INODE(k) == op->inode &&
2210 bio->bi_sector < KEY_OFFSET(k)) {
2211 struct bkey *bio_key;
2212 sector_t sector = PTR_OFFSET(k, ptr) +
2213 (bio->bi_sector - KEY_START(k));
2214 unsigned sectors = min_t(uint64_t, INT_MAX,
2215 KEY_OFFSET(k) - bio->bi_sector);
2216
2217 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
2218 if (!n)
2219 return -EAGAIN;
2220
2221 if (n == bio)
2222 op->lookup_done = true;
2223
2224 bio_key = &container_of(n, struct bbio, bio)->key;
2225
2226 /*
2227 * The bucket we're reading from might be reused while our bio
2228 * is in flight, and we could then end up reading the wrong
2229 * data.
2230 *
2231 * We guard against this by checking (in cache_read_endio()) if
2232 * the pointer is stale again; if so, we treat it as an error
2233 * and reread from the backing device (but we don't pass that
2234 * error up anywhere).
2235 */
2236
2237 bch_bkey_copy_single_ptr(bio_key, k, ptr);
2238 SET_PTR_OFFSET(bio_key, 0, sector);
2239
2240 n->bi_end_io = bch_cache_read_endio;
2241 n->bi_private = &s->cl;
2242
2243 trace_bcache_cache_hit(n);
2244 __bch_submit_bbio(n, b->c);
2245 }
2246
2247 return 0;
2248}
2249
2250int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
2251{
2252 struct search *s = container_of(op, struct search, op);
2253 struct bio *bio = &s->bio.bio;
2254
2255 int ret = 0;
2256 struct bkey *k;
2257 struct btree_iter iter;
2258 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2259
2260 pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
2261 (uint64_t) bio->bi_sector);
2262
2263 do {
2264 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2265 if (!k) {
2266 /*
2267 * b->key would be exactly what we want, except that
2268 * pointers to btree nodes have nonzero size - we
2269 * wouldn't go far enough
2270 */
2271
2272 ret = submit_partial_cache_miss(b, op,
2273 &KEY(KEY_INODE(&b->key),
2274 KEY_OFFSET(&b->key), 0));
2275 break;
2276 }
2277
2278 ret = b->level
2279 ? btree(search_recurse, k, b, op)
2280 : submit_partial_cache_hit(b, op, k);
2281 } while (!ret &&
2282 !op->lookup_done);
2283
2284 return ret;
2285}
2286
2287/* Keybuf code */
2288
2289static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2290{
2291 /* Overlapping keys compare equal */
2292 if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2293 return -1;
2294 if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2295 return 1;
2296 return 0;
2297}
2298
2299static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2300 struct keybuf_key *r)
2301{
2302 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2303}
2304
2305static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
2306 struct keybuf *buf, struct bkey *end)
2307{
2308 struct btree_iter iter;
2309 bch_btree_iter_init(b, &iter, &buf->last_scanned);
2310
2311 while (!array_freelist_empty(&buf->freelist)) {
2312 struct bkey *k = bch_btree_iter_next_filter(&iter, b,
2313 bch_ptr_bad);
2314
2315 if (!b->level) {
2316 if (!k) {
2317 buf->last_scanned = b->key;
2318 break;
2319 }
2320
2321 buf->last_scanned = *k;
2322 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2323 break;
2324
2325 if (buf->key_predicate(buf, k)) {
2326 struct keybuf_key *w;
2327
2328 pr_debug("%s", pkey(k));
2329
2330 spin_lock(&buf->lock);
2331
2332 w = array_alloc(&buf->freelist);
2333
2334 w->private = NULL;
2335 bkey_copy(&w->key, k);
2336
2337 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2338 array_free(&buf->freelist, w);
2339
2340 spin_unlock(&buf->lock);
2341 }
2342 } else {
2343 if (!k)
2344 break;
2345
2346 btree(refill_keybuf, k, b, op, buf, end);
2347 /*
2348 * Might get an error here, but can't really do anything
2349 * and it'll get logged elsewhere. Just read what we
2350 * can.
2351 */
2352
2353 if (bkey_cmp(&buf->last_scanned, end) >= 0)
2354 break;
2355
2356 cond_resched();
2357 }
2358 }
2359
2360 return 0;
2361}
2362
2363void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2364 struct bkey *end)
2365{
2366 struct bkey start = buf->last_scanned;
2367 struct btree_op op;
2368 bch_btree_op_init_stack(&op);
2369
2370 cond_resched();
2371
2372 btree_root(refill_keybuf, c, &op, buf, end);
2373 closure_sync(&op.cl);
2374
2375 pr_debug("found %s keys from %llu:%llu to %llu:%llu",
2376 RB_EMPTY_ROOT(&buf->keys) ? "no" :
2377 array_freelist_empty(&buf->freelist) ? "some" : "a few",
2378 KEY_INODE(&start), KEY_OFFSET(&start),
2379 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
2380
2381 spin_lock(&buf->lock);
2382
2383 if (!RB_EMPTY_ROOT(&buf->keys)) {
2384 struct keybuf_key *w;
2385 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2386 buf->start = START_KEY(&w->key);
2387
2388 w = RB_LAST(&buf->keys, struct keybuf_key, node);
2389 buf->end = w->key;
2390 } else {
2391 buf->start = MAX_KEY;
2392 buf->end = MAX_KEY;
2393 }
2394
2395 spin_unlock(&buf->lock);
2396}
2397
/*
 * Unlink @w from buf->keys and return its slot to buf->freelist.
 * Does no locking itself; the callers in this file hold buf->lock.
 */
2398static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2399{
2400 rb_erase(&w->node, &buf->keys);
2401 array_free(&buf->freelist, w);
2402}
2403
/* Locked wrapper: remove @w from @buf while holding buf->lock */
2404void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2405{
2406 spin_lock(&buf->lock);
2407 __bch_keybuf_del(buf, w);
2408 spin_unlock(&buf->lock);
2409}
2410
2411bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2412 struct bkey *end)
2413{
2414 bool ret = false;
2415 struct keybuf_key *p, *w, s;
2416 s.key = *start;
2417
2418 if (bkey_cmp(end, &buf->start) <= 0 ||
2419 bkey_cmp(start, &buf->end) >= 0)
2420 return false;
2421
2422 spin_lock(&buf->lock);
2423 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2424
2425 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2426 p = w;
2427 w = RB_NEXT(w, node);
2428
2429 if (p->private)
2430 ret = true;
2431 else
2432 __bch_keybuf_del(buf, p);
2433 }
2434
2435 spin_unlock(&buf->lock);
2436 return ret;
2437}
2438
2439struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2440{
2441 struct keybuf_key *w;
2442 spin_lock(&buf->lock);
2443
2444 w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2445
2446 while (w && w->private)
2447 w = RB_NEXT(w, node);
2448
2449 if (w)
2450 w->private = ERR_PTR(-EINTR);
2451
2452 spin_unlock(&buf->lock);
2453 return w;
2454}
2455
2456struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2457 struct keybuf *buf,
2458 struct bkey *end)
2459{
2460 struct keybuf_key *ret;
2461
2462 while (1) {
2463 ret = bch_keybuf_next(buf);
2464 if (ret)
2465 break;
2466
2467 if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2468 pr_debug("scan finished");
2469 break;
2470 }
2471
2472 bch_refill_keybuf(c, buf, end);
2473 }
2474
2475 return ret;
2476}
2477
2478void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn)
2479{
2480 buf->key_predicate = fn;
2481 buf->last_scanned = MAX_KEY;
2482 buf->keys = RB_ROOT;
2483
2484 spin_lock_init(&buf->lock);
2485 array_allocator_init(&buf->freelist);
2486}
2487
/*
 * Destroy the btree workqueues.  Each pointer is checked first, so this can
 * be called after a partially failed bch_btree_init().
 */
2488void bch_btree_exit(void)
2489{
2490 if (btree_io_wq)
2491 destroy_workqueue(btree_io_wq);
2492 if (bch_gc_wq)
2493 destroy_workqueue(bch_gc_wq);
2494}
2495
2496int __init bch_btree_init(void)
2497{
2498 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
2499 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
2500 return -ENOMEM;
2501
2502 return 0;
2503}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
new file mode 100644
index 000000000000..af4a7092a28c
--- /dev/null
+++ b/drivers/md/bcache/btree.h
@@ -0,0 +1,405 @@
1#ifndef _BCACHE_BTREE_H
2#define _BCACHE_BTREE_H
3
4/*
5 * THE BTREE:
6 *
7 * At a high level, bcache's btree is a relatively standard b+ tree. All keys and
8 * pointers are in the leaves; interior nodes only have pointers to the child
9 * nodes.
10 *
11 * In the interior nodes, a struct bkey always points to a child btree node, and
12 * the key is the highest key in the child node - except that the highest key in
13 * an interior node is always MAX_KEY. The size field refers to the size on disk
14 * of the child node - this would allow us to have variable sized btree nodes
15 * (handy for keeping the depth of the btree 1 by expanding just the root).
16 *
17 * Btree nodes are themselves log structured, but this is hidden fairly
18 * thoroughly. Btree nodes on disk will in practice have extents that overlap
19 * (because they were written at different times), but in memory we never have
20 * overlapping extents - when we read in a btree node from disk, the first thing
21 * we do is resort all the sets of keys with a mergesort, and in the same pass
22 * we check for overlapping extents and adjust them appropriately.
23 *
24 * struct btree_op is a central interface to the btree code. It's used for
25 * specifying read vs. write locking, and the embedded closure is used for
26 * waiting on IO or reserve memory.
27 *
28 * BTREE CACHE:
29 *
30 * Btree nodes are cached in memory; traversing the btree might require reading
31 * in btree nodes which is handled mostly transparently.
32 *
33 * bch_btree_node_get() looks up a btree node in the cache and reads it in from
34 * disk if necessary. This function is almost never called directly though - the
35 * btree() macro is used to get a btree node, call some function on it, and
36 * unlock the node after the function returns.
37 *
38 * The root is special cased - it's taken out of the cache's lru (thus pinning
39 * it in memory), so we can find the root of the btree by just dereferencing a
40 * pointer instead of looking it up in the cache. This makes locking a bit
41 * tricky, since the root pointer is protected by the lock in the btree node it
42 * points to - the btree_root() macro handles this.
43 *
44 * In various places we must be able to allocate memory for multiple btree nodes
45 * in order to make forward progress. To do this we use the btree cache itself
46 * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
47 * cache we can reuse. We can't allow more than one thread to be doing this at a
48 * time, so there's a lock, implemented by a pointer to the btree_op closure -
49 * this allows the btree_root() macro to implicitly release this lock.
50 *
51 * BTREE IO:
52 *
53 * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
54 * this.
55 *
56 * For writing, we have two btree_write structs embedded in struct btree - one
57 * write in flight, and one being set up, and we toggle between them.
58 *
59 * Writing is done with a single function - bch_btree_write() really serves two
60 * different purposes and should be broken up into two different functions. When
61 * passing now = false, it merely indicates that the node is now dirty - calling
62 * it ensures that the dirty keys will be written at some point in the future.
63 *
64 * When passing now = true, bch_btree_write() causes a write to happen
65 * "immediately" (if there was already a write in flight, it'll cause the write
66 * to happen as soon as the previous write completes). It returns immediately
67 * though - but it takes a refcount on the closure in struct btree_op you passed
68 * to it, so a closure_sync() later can be used to wait for the write to
69 * complete.
70 *
71 * This is handy because btree_split() and garbage collection can issue writes
72 * in parallel, reducing the amount of time they have to hold write locks.
73 *
74 * LOCKING:
75 *
76 * When traversing the btree, we may need write locks starting at some level -
77 * inserting a key into the btree will typically only require a write lock on
78 * the leaf node.
79 *
80 * This is specified with the lock field in struct btree_op; lock = 0 means we
81 * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
82 * checks this field and returns the node with the appropriate lock held.
83 *
84 * If, after traversing the btree, the insertion code discovers it has to split
85 * then it must restart from the root and take new locks - to do this it changes
86 * the lock field and returns -EINTR, which causes the btree_root() macro to
87 * loop.
88 *
89 * Handling cache misses requires a different mechanism for upgrading to a write
90 * lock. We do cache lookups with only a read lock held, but if we get a cache
91 * miss and we wish to insert this data into the cache, we have to insert a
92 * placeholder key to detect races - otherwise, we could race with a write and
93 * overwrite the data that was just written to the cache with stale data from
94 * the backing device.
95 *
96 * For this we use a sequence number that write locks and unlocks increment - to
97 * insert the check key it unlocks the btree node and then takes a write lock,
98 * and fails if the sequence number doesn't match.
99 */
100
101#include "bset.h"
102#include "debug.h"
103
/*
 * State for one write of a btree node.  struct btree embeds two of these
 * (writes[2]) and selects the current one via the write_idx flag.
 */
104struct btree_write {
105 struct closure *owner;
106 atomic_t *journal;
107
108 /* If btree_split() frees a btree node, it writes a new pointer to that
109 * btree node indicating it was freed; it takes a refcount on
110 * c->prio_blocked because we can't write the gens until the new
111 * pointer is on disk. This allows btree_write_endio() to release the
112 * refcount that btree_split() took.
113 */
114 int prio_blocked;
115};
116
/* In-memory representation of a single btree node */
117struct btree {
118 /* Hottest entries first */
119 struct hlist_node hash;
120
121 /* Key/pointer for this btree node */
122 BKEY_PADDED(key);
123
124 /* Single bit - set when accessed, cleared by shrinker */
125 unsigned long accessed;
 /* Incremented by rw_lock()/rw_unlock() whenever a write lock is taken or dropped */
126 unsigned long seq;
127 struct rw_semaphore lock;
128 struct cache_set *c;
129
 /* Bits from enum btree_flags, accessed through the BTREE_FLAG() helpers */
130 unsigned long flags;
 /* Offset, in blocks, of the next set to be written - see write_block() */
131 uint16_t written; /* would be nice to kill */
132 uint8_t level;
 /* Index of the last set in sets[] */
133 uint8_t nsets;
134 uint8_t page_order;
135
136 /*
137 * Set of sorted keys - the real btree node - plus a binary search tree
138 *
139 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
140 * to the memory we have allocated for this btree node. Additionally,
141 * set[0]->data points to the entire btree node as it exists on disk.
142 */
143 struct bset_tree sets[MAX_BSETS];
144
145 /* Used to refcount bio splits, also protects b->bio */
146 struct closure_with_waitlist io;
147
148 /* Gets transferred to w->prio_blocked - see the comment there */
149 int prio_blocked;
150
151 struct list_head list;
152 struct delayed_work work;
153
154 uint64_t io_start_time;
 /* Indexed by the write_idx flag - see btree_current_write()/btree_prev_write() */
155 struct btree_write writes[2];
156 struct bio *bio;
157};
158
/*
 * BTREE_FLAG(flag) generates two inline helpers for a bit in b->flags:
 *   btree_node_<flag>(b)	- test_bit() on BTREE_NODE_<flag>
 *   set_btree_node_<flag>(b)	- set_bit() on BTREE_NODE_<flag>
 */
159#define BTREE_FLAG(flag) \
160static inline bool btree_node_ ## flag(struct btree *b) \
161{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
162 \
163static inline void set_btree_node_ ## flag(struct btree *b) \
164{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
165
/* Bit numbers within struct btree's flags word */
166enum btree_flags {
167 BTREE_NODE_read_done,
168 BTREE_NODE_io_error,
169 BTREE_NODE_dirty,
170 BTREE_NODE_write_idx,
171};
172
/* Instantiate the test/set helpers for each flag */
173BTREE_FLAG(read_done);
174BTREE_FLAG(io_error);
175BTREE_FLAG(dirty);
176BTREE_FLAG(write_idx);
177
178static inline struct btree_write *btree_current_write(struct btree *b)
179{
180 return b->writes + btree_node_write_idx(b);
181}
182
183static inline struct btree_write *btree_prev_write(struct btree *b)
184{
185 return b->writes + (btree_node_write_idx(b) ^ 1);
186}
187
188static inline unsigned bset_offset(struct btree *b, struct bset *i)
189{
190 return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
191}
192
193static inline struct bset *write_block(struct btree *b)
194{
195 return ((void *) b->sets[0].data) + b->written * block_bytes(b->c);
196}
197
198static inline bool bset_written(struct btree *b, struct bset_tree *t)
199{
200 return t->data < write_block(b);
201}
202
203static inline bool bkey_written(struct btree *b, struct bkey *k)
204{
205 return k < write_block(b)->start;
206}
207
/* Reset c->sectors_to_gc to bucket_size * nbuckets / 8 */
208static inline void set_gc_sectors(struct cache_set *c)
209{
210 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
211}
212
/* Convenience wrapper: __bch_ptr_invalid() using the cache set and level of @b */
213static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
214{
215 return __bch_ptr_invalid(b->c, b->level, k);
216}
217
/* Convenience wrapper: __bch_btree_iter_init() starting from b->sets */
218static inline struct bkey *bch_btree_iter_init(struct btree *b,
219 struct btree_iter *iter,
220 struct bkey *search)
221{
222 return __bch_btree_iter_init(b, iter, search, b->sets);
223}
224
225/* Looping macros */
226
/*
 * for_each_cached_btree - visit every btree node in the cache set's hash table
 * @b:		struct btree * loop cursor
 * @c:		cache set
 * @iter:	integer lvalue used as the hash bucket index
 */
#define for_each_cached_btree(b, c, iter)				\
	for (iter = 0;							\
	     iter < ARRAY_SIZE((c)->bucket_hash);			\
	     iter++)							\
		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)

/*
 * for_each_key_filter - iterate over the keys of @b accepted by @filter
 * @b:		btree node
 * @k:		struct bkey * loop cursor
 * @iter:	pointer to a struct btree_iter
 * @filter:	passed through to bch_btree_iter_next_filter()
 */
#define for_each_key_filter(b, k, iter, filter)				\
	for (bch_btree_iter_init((b), (iter), NULL);			\
	     ((k) = bch_btree_iter_next_filter((iter), (b), filter));)

/*
 * for_each_key - iterate over all keys of @b
 * @b:		btree node
 * @k:		struct bkey * loop cursor
 * @iter:	pointer to a struct btree_iter
 */
#define for_each_key(b, k, iter)					\
	for (bch_btree_iter_init((b), (iter), NULL);			\
	     ((k) = bch_btree_iter_next(iter));)
240
241/* Recursing down the btree */
242
/*
 * State for one btree operation (lookup, insert, ...); passed down through
 * the btree() / btree_root() recursion macros.
 */
243struct btree_op {
244 struct closure cl;
245 struct cache_set *c;
246
247 /* Journal entry we have a refcount on */
248 atomic_t *journal;
249
250 /* Bio to be inserted into the cache */
251 struct bio *cache_bio;
252
253 unsigned inode;
254
255 uint16_t write_prio;
256
257 /* Btree level at which we start taking write locks */
258 short lock;
259
260 /* Btree insertion type */
261 enum {
262 BTREE_INSERT,
263 BTREE_REPLACE
264 } type:8;
265
266 unsigned csum:1;
267 unsigned skip:1;
268 unsigned flush_journal:1;
269
270 unsigned insert_data_done:1;
271 unsigned lookup_done:1;
272 unsigned insert_collision:1;
273
274 /* Anything after this point won't get zeroed in do_bio_hook() */
275
276 /* Keys to be inserted */
277 struct keylist keys;
 /* Key that a BTREE_REPLACE insert is checked against */
278 BKEY_PADDED(replace);
279};
280
281void bch_btree_op_init_stack(struct btree_op *);
282
283static inline void rw_lock(bool w, struct btree *b, int level)
284{
285 w ? down_write_nested(&b->lock, level + 1)
286 : down_read_nested(&b->lock, level + 1);
287 if (w)
288 b->seq++;
289}
290
291static inline void rw_unlock(bool w, struct btree *b)
292{
293#ifdef CONFIG_BCACHE_EDEBUG
294 unsigned i;
295
296 if (w &&
297 b->key.ptr[0] &&
298 btree_node_read_done(b))
299 for (i = 0; i <= b->nsets; i++)
300 bch_check_key_order(b, b->sets[i].data);
301#endif
302
303 if (w)
304 b->seq++;
305 (w ? up_write : up_read)(&b->lock);
306}
307
/* True if node (b) is at or below the level (s)->lock, i.e. gets a write lock */
308#define insert_lock(s, b) ((b)->level <= (s)->lock)
309
310/*
311 * These macros are for recursing down the btree - they handle the details of
312 * locking and looking up nodes in the cache for you. They're best treated as
313 * mere syntax when reading code that uses them.
314 *
315 * op->lock determines whether we take a read or a write lock at a given depth.
316 * If you've got a read lock and find that you need a write lock (i.e. you're
317 * going to have to split), set op->lock and return -EINTR; btree_root() will
318 * call you again and you'll have the correct lock.
319 */
320
/**
 * btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 *
 * Looks up the child node with bch_btree_node_get(), calls bch_btree_<fn>()
 * on it and unlocks it again.  Evaluates to the callee's return value, or to
 * PTR_ERR() of the lookup if the child could not be obtained.
 *
 * All locals are underscore-prefixed so that they cannot capture identifiers
 * used in the caller's argument expressions.
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, _l = (b)->level - 1;					\
	bool _w = _l <= (op)->lock;					\
	struct btree *_b = bch_btree_node_get((b)->c, key, _l, op);	\
	if (!IS_ERR(_b)) {						\
		_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);		\
		rw_unlock(_w, _b);					\
	} else								\
		_r = PTR_ERR(_b);					\
	_r;								\
})
340
341/**
342 * btree_root - call a function on the root of the btree
343 * @fn: function to call, which will be passed the child node
344 * @c: cache set
345 * @op: pointer to struct btree_op
 *
 * Locks the current root (for writing if insert_lock() says so) and then
 * re-checks that it is still the root and that the lock type is still the
 * right one; if either changed while waiting for the lock, @fn is skipped and
 * the loop retries.  The loop also retries whenever @fn returns -EINTR.
 * bch_cannibalize_unlock() is called after every attempt.
346 */
347#define btree_root(fn, c, op, ...) \
348({ \
349 int _r = -EINTR; \
350 do { \
351 struct btree *_b = (c)->root; \
352 bool _w = insert_lock(op, _b); \
353 rw_lock(_w, _b, _b->level); \
354 if (_b == (c)->root && \
355 _w == insert_lock(op, _b)) \
356 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
357 rw_unlock(_w, _b); \
358 bch_cannibalize_unlock(c, &(op)->cl); \
359 } while (_r == -EINTR); \
360 \
361 _r; \
362})
363
364static inline bool should_split(struct btree *b)
365{
366 struct bset *i = write_block(b);
367 return b->written >= btree_blocks(b) ||
368 (i->seq == b->sets[0].data->seq &&
369 b->written + __set_blocks(i, i->keys + 15, b->c)
370 > btree_blocks(b));
371}
372
373void bch_btree_read_done(struct closure *);
374void bch_btree_read(struct btree *);
375void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
376
377void bch_cannibalize_unlock(struct cache_set *, struct closure *);
378void bch_btree_set_root(struct btree *);
379struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
380struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
381 int, struct btree_op *);
382
383bool bch_btree_insert_keys(struct btree *, struct btree_op *);
384bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
385 struct bio *);
386int bch_btree_insert(struct btree_op *, struct cache_set *);
387
388int bch_btree_search_recurse(struct btree *, struct btree_op *);
389
390void bch_queue_gc(struct cache_set *);
391size_t bch_btree_gc_finish(struct cache_set *);
392void bch_moving_gc(struct closure *);
393int bch_btree_check(struct cache_set *, struct btree_op *);
394uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
395
396void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *);
397void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *);
398bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
399 struct bkey *);
400void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
401struct keybuf_key *bch_keybuf_next(struct keybuf *);
402struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
403 struct keybuf *, struct bkey *);
404
405#endif
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
new file mode 100644
index 000000000000..bd05a9a8c7cf
--- /dev/null
+++ b/drivers/md/bcache/closure.c
@@ -0,0 +1,345 @@
1/*
2 * Asynchronous refcounty things
3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc.
6 */
7
8#include <linux/debugfs.h>
9#include <linux/module.h>
10#include <linux/seq_file.h>
11
12#include "closure.h"
13
14void closure_queue(struct closure *cl)
15{
16 struct workqueue_struct *wq = cl->wq;
17 if (wq) {
18 INIT_WORK(&cl->work, cl->work.func);
19 BUG_ON(!queue_work(wq, &cl->work));
20 } else
21 cl->fn(cl);
22}
23EXPORT_SYMBOL_GPL(closure_queue);
24
/*
 * Expands to one switch case: for closure type TYPE_<type>, return the
 * address of @field in the struct <type> that embeds the local variable
 * 'cl' as its 'cl' member. Only usable inside a switch on cl->type.
 */
25#define CL_FIELD(type, field) \
26 case TYPE_ ## type: \
27 return &container_of(cl, struct type, cl)->field
28
29static struct closure_waitlist *closure_waitlist(struct closure *cl)
30{
31 switch (cl->type) {
32 CL_FIELD(closure_with_waitlist, wait);
33 CL_FIELD(closure_with_waitlist_and_timer, wait);
34 default:
35 return NULL;
36 }
37}
38
39static struct timer_list *closure_timer(struct closure *cl)
40{
41 switch (cl->type) {
42 CL_FIELD(closure_with_timer, timer);
43 CL_FIELD(closure_with_waitlist_and_timer, timer);
44 default:
45 return NULL;
46 }
47}
48
49static inline void closure_put_after_sub(struct closure *cl, int flags)
50{
51 int r = flags & CLOSURE_REMAINING_MASK;
52
53 BUG_ON(flags & CLOSURE_GUARD_MASK);
54 BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING)));
55
56 /* Must deliver precisely one wakeup */
57 if (r == 1 && (flags & CLOSURE_SLEEPING))
58 wake_up_process(cl->task);
59
60 if (!r) {
61 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
62 /* CLOSURE_BLOCKING might be set - clear it */
63 atomic_set(&cl->remaining,
64 CLOSURE_REMAINING_INITIALIZER);
65 closure_queue(cl);
66 } else {
67 struct closure *parent = cl->parent;
68 struct closure_waitlist *wait = closure_waitlist(cl);
69
70 closure_debug_destroy(cl);
71
72 atomic_set(&cl->remaining, -1);
73
74 if (wait)
75 closure_wake_up(wait);
76
77 if (cl->fn)
78 cl->fn(cl);
79
80 if (parent)
81 closure_put(parent);
82 }
83 }
84}
85
86/* For clearing flags with the same atomic op as a put */
87void closure_sub(struct closure *cl, int v)
88{
89 closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
90}
91EXPORT_SYMBOL_GPL(closure_sub);
92
93void closure_put(struct closure *cl)
94{
95 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
96}
97EXPORT_SYMBOL_GPL(closure_put);
98
/*
 * Record in cl->waiting_on the value @f (callers in this file pass
 * _RET_IP_ or 0). Compiles to nothing unless CONFIG_BCACHE_CLOSURES_DEBUG
 * is set.
 */
99static void set_waiting(struct closure *cl, unsigned long f)
100{
101#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
102 cl->waiting_on = f;
103#endif
104}
105
106void __closure_wake_up(struct closure_waitlist *wait_list)
107{
108 struct llist_node *list;
109 struct closure *cl;
110 struct llist_node *reverse = NULL;
111
112 list = llist_del_all(&wait_list->list);
113
114 /* We first reverse the list to preserve FIFO ordering and fairness */
115
116 while (list) {
117 struct llist_node *t = list;
118 list = llist_next(list);
119
120 t->next = reverse;
121 reverse = t;
122 }
123
124 /* Then do the wakeups */
125
126 while (reverse) {
127 cl = container_of(reverse, struct closure, list);
128 reverse = llist_next(reverse);
129
130 set_waiting(cl, 0);
131 closure_sub(cl, CLOSURE_WAITING + 1);
132 }
133}
134EXPORT_SYMBOL_GPL(__closure_wake_up);
135
136bool closure_wait(struct closure_waitlist *list, struct closure *cl)
137{
138 if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
139 return false;
140
141 set_waiting(cl, _RET_IP_);
142 atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
143 llist_add(&cl->list, &list->list);
144
145 return true;
146}
147EXPORT_SYMBOL_GPL(closure_wait);
148
149/**
150 * closure_sync() - sleep until a closure a closure has nothing left to wait on
151 *
152 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
153 * the last refcount.
154 */
155void closure_sync(struct closure *cl)
156{
157 while (1) {
158 __closure_start_sleep(cl);
159 closure_set_ret_ip(cl);
160
161 if ((atomic_read(&cl->remaining) &
162 CLOSURE_REMAINING_MASK) == 1)
163 break;
164
165 schedule();
166 }
167
168 __closure_end_sleep(cl);
169}
170EXPORT_SYMBOL_GPL(closure_sync);
171
172/**
173 * closure_trylock() - try to acquire the closure, without waiting
174 * @cl: closure to lock
175 *
176 * Returns true if the closure was succesfully locked.
177 */
178bool closure_trylock(struct closure *cl, struct closure *parent)
179{
180 if (atomic_cmpxchg(&cl->remaining, -1,
181 CLOSURE_REMAINING_INITIALIZER) != -1)
182 return false;
183
184 closure_set_ret_ip(cl);
185
186 smp_mb();
187 cl->parent = parent;
188 if (parent)
189 closure_get(parent);
190
191 closure_debug_create(cl);
192 return true;
193}
194EXPORT_SYMBOL_GPL(closure_trylock);
195
196void __closure_lock(struct closure *cl, struct closure *parent,
197 struct closure_waitlist *wait_list)
198{
199 struct closure wait;
200 closure_init_stack(&wait);
201
202 while (1) {
203 if (closure_trylock(cl, parent))
204 return;
205
206 closure_wait_event_sync(wait_list, &wait,
207 atomic_read(&cl->remaining) == -1);
208 }
209}
210EXPORT_SYMBOL_GPL(__closure_lock);
211
212static void closure_delay_timer_fn(unsigned long data)
213{
214 struct closure *cl = (struct closure *) data;
215 closure_sub(cl, CLOSURE_TIMER + 1);
216}
217
218void do_closure_timer_init(struct closure *cl)
219{
220 struct timer_list *timer = closure_timer(cl);
221
222 init_timer(timer);
223 timer->data = (unsigned long) cl;
224 timer->function = closure_delay_timer_fn;
225}
226EXPORT_SYMBOL_GPL(do_closure_timer_init);
227
228bool __closure_delay(struct closure *cl, unsigned long delay,
229 struct timer_list *timer)
230{
231 if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
232 return false;
233
234 BUG_ON(timer_pending(timer));
235
236 timer->expires = jiffies + delay;
237
238 atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
239 add_timer(timer);
240 return true;
241}
242EXPORT_SYMBOL_GPL(__closure_delay);
243
244void __closure_flush(struct closure *cl, struct timer_list *timer)
245{
246 if (del_timer(timer))
247 closure_sub(cl, CLOSURE_TIMER + 1);
248}
249EXPORT_SYMBOL_GPL(__closure_flush);
250
251void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
252{
253 if (del_timer_sync(timer))
254 closure_sub(cl, CLOSURE_TIMER + 1);
255}
256EXPORT_SYMBOL_GPL(__closure_flush_sync);
257
258#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
259
260static LIST_HEAD(closure_list);
261static DEFINE_SPINLOCK(closure_list_lock);
262
263void closure_debug_create(struct closure *cl)
264{
265 unsigned long flags;
266
267 BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
268 cl->magic = CLOSURE_MAGIC_ALIVE;
269
270 spin_lock_irqsave(&closure_list_lock, flags);
271 list_add(&cl->all, &closure_list);
272 spin_unlock_irqrestore(&closure_list_lock, flags);
273}
274EXPORT_SYMBOL_GPL(closure_debug_create);
275
276void closure_debug_destroy(struct closure *cl)
277{
278 unsigned long flags;
279
280 BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
281 cl->magic = CLOSURE_MAGIC_DEAD;
282
283 spin_lock_irqsave(&closure_list_lock, flags);
284 list_del(&cl->all);
285 spin_unlock_irqrestore(&closure_list_lock, flags);
286}
287EXPORT_SYMBOL_GPL(closure_debug_destroy);
288
289static struct dentry *debug;
290
291#define work_data_bits(work) ((unsigned long *)(&(work)->data))
292
293static int debug_seq_show(struct seq_file *f, void *data)
294{
295 struct closure *cl;
296 spin_lock_irq(&closure_list_lock);
297
298 list_for_each_entry(cl, &closure_list, all) {
299 int r = atomic_read(&cl->remaining);
300
301 seq_printf(f, "%p: %pF -> %pf p %p r %i ",
302 cl, (void *) cl->ip, cl->fn, cl->parent,
303 r & CLOSURE_REMAINING_MASK);
304
305 seq_printf(f, "%s%s%s%s%s%s\n",
306 test_bit(WORK_STRUCT_PENDING,
307 work_data_bits(&cl->work)) ? "Q" : "",
308 r & CLOSURE_RUNNING ? "R" : "",
309 r & CLOSURE_BLOCKING ? "B" : "",
310 r & CLOSURE_STACK ? "S" : "",
311 r & CLOSURE_SLEEPING ? "Sl" : "",
312 r & CLOSURE_TIMER ? "T" : "");
313
314 if (r & CLOSURE_WAITING)
315 seq_printf(f, " W %pF\n",
316 (void *) cl->waiting_on);
317
318 seq_printf(f, "\n");
319 }
320
321 spin_unlock_irq(&closure_list_lock);
322 return 0;
323}
324
/*
 * open() handler for the "closures" debugfs file: sets up a single-record
 * seq_file whose contents are produced by debug_seq_show(). No private
 * data is passed (NULL).
 */
325static int debug_seq_open(struct inode *inode, struct file *file)
326{
327 return single_open(file, debug_seq_show, NULL);
328}
329
/*
 * File operations for the "closures" debugfs file: read-only seq_file
 * access paired with single_open()/single_release().
 */
330static const struct file_operations debug_ops = {
331 .owner = THIS_MODULE,
332 .open = debug_seq_open,
333 .read = seq_read,
334 .release = single_release
335};
336
/*
 * Create the "closures" debugfs file (mode 0400, NULL parent dentry, no
 * private data) and remember its dentry in 'debug'. The result of
 * debugfs_create_file() is stored but not checked.
 */
337void __init closure_debug_init(void)
338{
339 debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
340}
341
342#endif
343
344MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
345MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
new file mode 100644
index 000000000000..00039924ea9d
--- /dev/null
+++ b/drivers/md/bcache/closure.h
@@ -0,0 +1,672 @@
1#ifndef _LINUX_CLOSURE_H
2#define _LINUX_CLOSURE_H
3
4#include <linux/llist.h>
5#include <linux/sched.h>
6#include <linux/workqueue.h>
7
8/*
9 * Closure is perhaps the most overused and abused term in computer science, but
10 * since I've been unable to come up with anything better you're stuck with it
11 * again.
12 *
13 * What are closures?
14 *
15 * They embed a refcount. The basic idea is they count "things that are in
16 * progress" - in flight bios, some other thread that's doing something else -
17 * anything you might want to wait on.
18 *
19 * The refcount may be manipulated with closure_get() and closure_put().
20 * closure_put() is where many of the interesting things happen, when it causes
21 * the refcount to go to 0.
22 *
23 * Closures can be used to wait on things both synchronously and asynchronously,
24 * and synchronous and asynchronous use can be mixed without restriction. To
25 * wait synchronously, use closure_sync() - you will sleep until your closure's
26 * refcount hits 1.
27 *
28 * To wait asynchronously, use
29 * continue_at(cl, next_function, workqueue);
30 *
31 * passing it, as you might expect, the function to run when nothing is pending
32 * and the workqueue to run that function out of.
33 *
34 * continue_at() also, critically, is a macro that returns the calling function.
35 * There's good reason for this.
36 *
37 * To use closures asynchronously and safely, they must always have a refcount while
38 * they are running owned by the thread that is running them. Otherwise, suppose
39 * you submit some bios and wish to have a function run when they all complete:
40 *
41 * foo_endio(struct bio *bio, int error)
42 * {
43 * closure_put(cl);
44 * }
45 *
46 * closure_init(cl);
47 *
48 * do_stuff();
49 * closure_get(cl);
50 * bio1->bi_endio = foo_endio;
51 * bio_submit(bio1);
52 *
53 * do_more_stuff();
54 * closure_get(cl);
55 * bio2->bi_endio = foo_endio;
56 * bio_submit(bio2);
57 *
58 * continue_at(cl, complete_some_read, system_wq);
59 *
60 * If closure's refcount started at 0, complete_some_read() could run before the
61 * second bio was submitted - which is almost always not what you want! More
62 * importantly, it wouldn't be possible to say whether the original thread or
63 * complete_some_read()'s thread owned the closure - and whatever state it was
64 * associated with!
65 *
66 * So, closure_init() initializes a closure's refcount to 1 - and when a
67 * closure_fn is run, the refcount will be reset to 1 first.
68 *
69 * Then, the rule is - if you got the refcount with closure_get(), release it
70 * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
71 * on a closure because you called closure_init() or you were run out of a
72 * closure - _always_ use continue_at(). Doing so consistently will help
73 * eliminate an entire class of particularly pernicious races.
74 *
75 * For a closure to wait on an arbitrary event, we need to introduce waitlists:
76 *
77 * struct closure_waitlist list;
78 * closure_wait_event(list, cl, condition);
79 * closure_wake_up(wait_list);
80 *
81 * These work analogously to wait_event() and wake_up() - except that instead of
82 * operating on the current thread (for wait_event()) and lists of threads, they
83 * operate on an explicit closure and lists of closures.
84 *
85 * Because it's a closure we can now wait either synchronously or
86 * asynchronously. closure_wait_event() returns the current value of the
87 * condition, and if it returned false continue_at() or closure_sync() can be
88 * used to wait for it to become true.
89 *
90 * It's useful for waiting on things when you can't sleep in the context in
91 * which you must check the condition (perhaps a spinlock held, or you might be
92 * beneath generic_make_request() - in which case you can't sleep on IO).
93 *
94 * closure_wait_event() will wait either synchronously or asynchronously,
95 * depending on whether the closure is in blocking mode or not. You can pick a
96 * mode explicitly with closure_wait_event_sync() and
97 * closure_wait_event_async(), which do just what you might expect.
98 *
99 * Lastly, you might have a wait list dedicated to a specific event, and have no
100 * need for specifying the condition - you just want to wait until someone runs
101 * closure_wake_up() on the appropriate wait list. In that case, just use
102 * closure_wait(). It will return either true or false, depending on whether the
103 * closure was already on a wait list or not - a closure can only be on one wait
104 * list at a time.
105 *
106 * Parents:
107 *
108 * closure_init() takes two arguments - it takes the closure to initialize, and
109 * a (possibly null) parent.
110 *
111 * If parent is non null, the new closure will have a refcount for its lifetime;
112 * a closure is considered to be "finished" when its refcount hits 0 and the
113 * function to run is null. Hence
114 *
115 * continue_at(cl, NULL, NULL);
116 *
117 * returns up the (spaghetti) stack of closures, precisely like normal return
118 * returns up the C stack. continue_at() with non null fn is better thought of
119 * as doing a tail call.
120 *
121 * All this implies that a closure should typically be embedded in a particular
122 * struct (which its refcount will normally control the lifetime of), and that
123 * struct can very much be thought of as a stack frame.
124 *
125 * Locking:
126 *
127 * Closures are based on work items but they can be thought of as more like
128 * threads - in that like threads and unlike work items they have a well
129 * defined lifetime; they are created (with closure_init()) and eventually
130 * complete after a continue_at(cl, NULL, NULL).
131 *
132 * Suppose you've got some larger structure with a closure embedded in it that's
133 * used for periodically doing garbage collection. You only want one garbage
134 * collection happening at a time, so the natural thing to do is protect it with
135 * a lock. However, it's difficult to use a lock protecting a closure correctly
136 * because the unlock should come after the last continue_at() (additionally, if
137 * you're using the closure asynchronously a mutex won't work since a mutex has
138 * to be unlocked by the same process that locked it).
139 *
140 * So to make it less error prone and more efficient, we also have the ability
141 * to use closures as locks:
142 *
143 * closure_init_unlocked();
144 * closure_trylock();
145 *
146 * That's all we need for trylock() - the last closure_put() implicitly unlocks
147 * it for you. But for closure_lock(), we also need a wait list:
148 *
149 * struct closure_with_waitlist frobnicator_cl;
150 *
151 * closure_init_unlocked(&frobnicator_cl);
152 * closure_lock(&frobnicator_cl);
153 *
154 * A closure_with_waitlist embeds a closure and a wait list - much like struct
155 * delayed_work embeds a work item and a timer_list. The important thing is, use
156 * it exactly like you would a regular closure and closure_put() will magically
157 * handle everything for you.
158 *
159 * We've got closures that embed timers, too. They're called, appropriately
160 * enough:
161 * struct closure_with_timer;
162 *
163 * This gives you access to closure_delay(). It takes a refcount for a specified
164 * number of jiffies - you could then call closure_sync() (for a slightly
165 * convoluted version of msleep()) or continue_at() - which gives you the same
166 * effect as using a delayed work item, except you can reuse the work_struct
167 * already embedded in struct closure.
168 *
169 * Lastly, there's struct closure_with_waitlist_and_timer. It does what you
170 * probably expect, if you happen to need the features of both. (You don't
171 * really want to know how all this is implemented, but if I've done my job
172 * right you shouldn't have to care).
173 */
174
175struct closure;
176typedef void (closure_fn) (struct closure *);
177
178struct closure_waitlist {
179 struct llist_head list;
180};
181
/*
 * Identifies which struct a closure is embedded in. Each TYPE_<name>
 * corresponds to struct <name>; the value is stored in closure.type and
 * used to locate the optional wait list / timer next to the closure.
 * MAX_CLOSURE_TYPE is the largest valid value.
 */
182enum closure_type {
183 TYPE_closure = 0,
184 TYPE_closure_with_waitlist = 1,
185 TYPE_closure_with_timer = 2,
186 TYPE_closure_with_waitlist_and_timer = 3,
187 MAX_CLOSURE_TYPE = 3,
188};
189
/*
 * Flag bits stored in the upper part of closure.remaining, above the
 * reference count (see CLOSURE_REMAINING_MASK). Flags occupy every other
 * bit starting at CLOSURE_BITS_START.
 */
190enum closure_state {
191 /*
 * CLOSURE_DESTRUCTOR: When remaining hits 0 with this set, cl->fn is
 * called directly as a destructor after the closure has been marked
 * finished, instead of being queued as the next function.
 *
192 * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
193 * waiting asynchronously
194 *
195 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
196 * the thread that owns the closure, and cleared by the thread that's
197 * waking up the closure.
198 *
199 * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
200 * - indicates that cl->task is valid and closure_put() may wake it up.
201 * Only set or cleared by the thread that owns the closure.
202 *
203 * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure
204 * has an outstanding timer. Must be set by the thread that owns the
205 * closure, and cleared by the timer function when the timer goes off.
206 *
207 * The rest are for debugging and don't affect behaviour:
208 *
209 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
210 * closure_init() and when closure_put() runs the next function), and
211 * must be cleared before remaining hits 0. Primarily to help guard
212 * against incorrect usage and accidentally transferring references.
213 * continue_at() and closure_return() clear it for you, if you're doing
214 * something unusual you can use closure_set_stopped() which also helps
215 * annotate where references are being transferred.
216 *
217 * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
218 * closure with this flag set
219 */
220
221 CLOSURE_BITS_START = (1 << 19),
222 CLOSURE_DESTRUCTOR = (1 << 19),
223 CLOSURE_BLOCKING = (1 << 21),
224 CLOSURE_WAITING = (1 << 23),
225 CLOSURE_SLEEPING = (1 << 25),
226 CLOSURE_TIMER = (1 << 27),
227 CLOSURE_RUNNING = (1 << 29),
228 CLOSURE_STACK = (1 << 31),
229};
230
/*
 * The bit directly above each state flag. closure_put_after_sub() BUGs if
 * any of these is set in remaining.
 *
 * NOTE(review): CLOSURE_STACK is bit 31, so its guard (bit 32) cannot be
 * represented if int is 32 bits wide, and the shift of that bit is not
 * well defined in C - confirm whether CLOSURE_STACK is meant to be guarded.
 */
231#define CLOSURE_GUARD_MASK \
232 ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \
233 CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1)
234
235#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
236#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
237
/*
 * struct closure - a refcount with an optional continuation and parent.
 */
238struct closure {
	/*
	 * The scheduling fields share storage with the work item:
	 * closure_queue() reads wq, then reinitialises 'work' in place
	 * using work.func as the handler.
	 */
239 union {
240 struct {
	/* workqueue to run fn on; NULL means call fn directly */
241 struct workqueue_struct *wq;
	/* task to wake while CLOSURE_SLEEPING is set */
242 struct task_struct *task;
	/* link on a closure_waitlist */
243 struct llist_node list;
	/* next function to run, or destructor */
244 closure_fn *fn;
245 };
246 struct work_struct work;
247 };
248
	/* closure holding a reference for us; may be NULL */
249 struct closure *parent;
250
	/* reference count in the low bits, closure_state flags above; -1 = unlocked */
251 atomic_t remaining;
252
	/* which containing struct this closure is embedded in */
253 enum closure_type type;
254
255#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
256#define CLOSURE_MAGIC_DEAD 0xc054dead
257#define CLOSURE_MAGIC_ALIVE 0xc054a11e
258
	/* CLOSURE_MAGIC_ALIVE or CLOSURE_MAGIC_DEAD */
259 unsigned magic;
	/* link on the global closure_list */
260 struct list_head all;
	/* code location recorded by closure_set_ip()/closure_set_ret_ip() */
261 unsigned long ip;
	/* caller of closure_wait(), 0 once woken */
262 unsigned long waiting_on;
263#endif
264};
265
/*
 * A closure plus a wait list. 'cl' must stay the first member: the
 * closure macros cast a pointer to this struct to struct closure *.
 */
266struct closure_with_waitlist {
267 struct closure cl;
268 struct closure_waitlist wait;
269};
270
/*
 * A closure plus a timer, for use with closure_delay(). 'cl' must stay the
 * first member: the closure macros cast a pointer to this struct to
 * struct closure *.
 */
271struct closure_with_timer {
272 struct closure cl;
273 struct timer_list timer;
274};
275
/*
 * A closure with both a wait list and a timer. 'cl' must stay the first
 * member: the closure macros cast a pointer to this struct to
 * struct closure *.
 */
276struct closure_with_waitlist_and_timer {
277 struct closure cl;
278 struct closure_waitlist wait;
279 struct timer_list timer;
280};
281
282extern unsigned invalid_closure_type(void);
283
/*
 * One link of the conditional chain built by __closure_type(): yields
 * TYPE_<_t> if @cl has type struct <_t>, otherwise falls through to
 * whatever expression follows the trailing ':'.
 */
284#define __CLOSURE_TYPE(cl, _t) \
285 __builtin_types_compatible_p(typeof(cl), struct _t) \
286 ? TYPE_ ## _t : \
287
/*
 * Map the static type of the object @cl (not a pointer to it) to its
 * enum closure_type value. If @cl is none of the four closure structs
 * the expression ends in a call to invalid_closure_type(), which is only
 * declared in this header.
 */
288#define __closure_type(cl) \
289( \
290 __CLOSURE_TYPE(cl, closure) \
291 __CLOSURE_TYPE(cl, closure_with_waitlist) \
292 __CLOSURE_TYPE(cl, closure_with_timer) \
293 __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \
294 invalid_closure_type() \
295)
296
297void closure_sub(struct closure *cl, int v);
298void closure_put(struct closure *cl);
299void closure_queue(struct closure *cl);
300void __closure_wake_up(struct closure_waitlist *list);
301bool closure_wait(struct closure_waitlist *list, struct closure *cl);
302void closure_sync(struct closure *cl);
303
304bool closure_trylock(struct closure *cl, struct closure *parent);
305void __closure_lock(struct closure *cl, struct closure *parent,
306 struct closure_waitlist *wait_list);
307
308void do_closure_timer_init(struct closure *cl);
309bool __closure_delay(struct closure *cl, unsigned long delay,
310 struct timer_list *timer);
311void __closure_flush(struct closure *cl, struct timer_list *timer);
312void __closure_flush_sync(struct closure *cl, struct timer_list *timer);
313
314#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
315
316void closure_debug_init(void);
317void closure_debug_create(struct closure *cl);
318void closure_debug_destroy(struct closure *cl);
319
320#else
321
322static inline void closure_debug_init(void) {}
323static inline void closure_debug_create(struct closure *cl) {}
324static inline void closure_debug_destroy(struct closure *cl) {}
325
326#endif
327
/*
 * Debug aid: record the current code location (_THIS_IP_) in cl->ip.
 * Does nothing unless CONFIG_BCACHE_CLOSURES_DEBUG is set.
 */
328static inline void closure_set_ip(struct closure *cl)
329{
330#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
331 cl->ip = _THIS_IP_;
332#endif
333}
334
/*
 * Debug aid: record the return address (_RET_IP_) in cl->ip.
 * Does nothing unless CONFIG_BCACHE_CLOSURES_DEBUG is set.
 */
335static inline void closure_set_ret_ip(struct closure *cl)
336{
337#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
338 cl->ip = _RET_IP_;
339#endif
340}
341
342static inline void closure_get(struct closure *cl)
343{
344#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
345 BUG_ON((atomic_inc_return(&cl->remaining) &
346 CLOSURE_REMAINING_MASK) <= 1);
347#else
348 atomic_inc(&cl->remaining);
349#endif
350}
351
/*
 * Clear CLOSURE_RUNNING by subtracting it from remaining. The flag is not
 * tested first, so it must currently be set.
 */
352static inline void closure_set_stopped(struct closure *cl)
353{
354 atomic_sub(CLOSURE_RUNNING, &cl->remaining);
355}
356
/* Returns true if CLOSURE_RUNNING is not set in cl->remaining. */
357static inline bool closure_is_stopped(struct closure *cl)
358{
359 return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING);
360}
361
/*
 * Returns true if cl->remaining is -1, the value used for a closure that
 * is unlocked (see closure_trylock()).
 */
362static inline bool closure_is_unlocked(struct closure *cl)
363{
364 return atomic_read(&cl->remaining) == -1;
365}
366
367static inline void do_closure_init(struct closure *cl, struct closure *parent,
368 bool running)
369{
370 switch (cl->type) {
371 case TYPE_closure_with_timer:
372 case TYPE_closure_with_waitlist_and_timer:
373 do_closure_timer_init(cl);
374 default:
375 break;
376 }
377
378 cl->parent = parent;
379 if (parent)
380 closure_get(parent);
381
382 if (running) {
383 closure_debug_create(cl);
384 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
385 } else
386 atomic_set(&cl->remaining, -1);
387
388 closure_set_ip(cl);
389}
390
/*
 * Hack to get at the embedded closure if there is one, by doing an unsafe cast:
 * the result of __closure_type() is thrown away, it's used merely for type
 * checking.
 *
 * @cl is a pointer to one of the closure structs. The argument is
 * parenthesised at each use so that expressions such as 'p + 1' expand
 * correctly.
 */
#define __to_internal_closure(cl)				\
({								\
	BUILD_BUG_ON(__closure_type(*(cl)) > MAX_CLOSURE_TYPE);	\
	(struct closure *) (cl);				\
})
401
/*
 * Record the closure flavour of *@cl in its embedded closure's 'type'
 * field, then run the common initialisation. @running is passed through
 * to do_closure_init().
 */
402#define closure_init_type(cl, parent, running) \
403do { \
404 struct closure *_cl = __to_internal_closure(cl); \
405 _cl->type = __closure_type(*(cl)); \
406 do_closure_init(_cl, parent, running); \
407} while (0)
408
409/**
410 * __closure_init() - Initialize a closure, skipping the memset()
411 *
412 * May be used instead of closure_init() when memory has already been zeroed.
413 */
414#define __closure_init(cl, parent) \
415 closure_init_type(cl, parent, true)
416
417/**
418 * closure_init() - Initialize a closure, setting the refcount to 1
419 * @cl: closure to initialize
420 * @parent: parent of the new closure. cl will take a refcount on it for its
421 * lifetime; may be NULL.
422 */
423#define closure_init(cl, parent) \
424do { \
425 memset((cl), 0, sizeof(*(cl))); \
426 __closure_init(cl, parent); \
427} while (0)
428
/*
 * Initialise a closure that lives on the stack: zero it and start with a
 * refcount of 1 with CLOSURE_RUNNING, CLOSURE_BLOCKING and CLOSURE_STACK
 * set. No parent is set and the debug list is not touched.
 */
429static inline void closure_init_stack(struct closure *cl)
430{
431 memset(cl, 0, sizeof(struct closure));
432 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|
433 CLOSURE_BLOCKING|CLOSURE_STACK);
434}
435
436/**
437 * closure_init_unlocked() - Initialize a closure but leave it unlocked.
438 * @cl: closure to initialize
439 *
440 * For when the closure will be used as a lock. The closure may not be used
441 * until after a closure_lock() or closure_trylock().
442 */
443#define closure_init_unlocked(cl) \
444do { \
445 memset((cl), 0, sizeof(*(cl))); \
446 closure_init_type(cl, NULL, false); \
447} while (0)
448
449/**
450 * closure_lock() - lock and initialize a closure.
451 * @cl: the closure to lock
452 * @parent: the new parent for this closure
453 *
454 * The closure must be of one of the types that has a waitlist (otherwise we
455 * wouldn't be able to sleep on contention).
456 *
457 * @parent has exactly the same meaning as in closure_init(); if non null, the
458 * closure will take a reference on @parent which will be released when it is
459 * unlocked.
460 */
461#define closure_lock(cl, parent) \
462 __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
463
464/**
465 * closure_delay() - delay some number of jiffies
466 * @cl: the closure that will sleep
467 * @delay: the delay in jiffies
468 *
469 * Takes a refcount on @cl which will be released after @delay jiffies; this may
470 * be used to have a function run after a delay with continue_at(), or
471 * closure_sync() may be used for a convoluted version of msleep().
472 */
473#define closure_delay(cl, delay) \
474 __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer)
475
/*
 * Cancel the timer of a closure_with_timer-style @cl. If the timer was
 * removed, its CLOSURE_TIMER flag and reference are dropped immediately
 * (see __closure_flush()).
 */
476#define closure_flush(cl) \
477 __closure_flush(__to_internal_closure(cl), &(cl)->timer)
478
/*
 * As closure_flush(), but cancels the timer with del_timer_sync() (see
 * __closure_flush_sync()).
 */
479#define closure_flush_sync(cl) \
480 __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer)
481
482static inline void __closure_end_sleep(struct closure *cl)
483{
484 __set_current_state(TASK_RUNNING);
485
486 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
487 atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
488}
489
490static inline void __closure_start_sleep(struct closure *cl)
491{
492 closure_set_ip(cl);
493 cl->task = current;
494 set_current_state(TASK_UNINTERRUPTIBLE);
495
496 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
497 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
498}
499
500/**
501 * closure_blocking() - returns true if the closure is in blocking mode.
502 *
503 * If a closure is in blocking mode, closure_wait_event() will sleep until the
504 * condition is true instead of waiting asynchronously.
505 */
506static inline bool closure_blocking(struct closure *cl)
507{
508 return atomic_read(&cl->remaining) & CLOSURE_BLOCKING;
509}
510
511/**
512 * set_closure_blocking() - put a closure in blocking mode.
513 *
514 * If a closure is in blocking mode, closure_wait_event() will sleep until the
515 * condition is true instead of waiting asynchronously.
516 *
517 * Not thread safe - can only be called by the thread running the closure.
518 */
519static inline void set_closure_blocking(struct closure *cl)
520{
521 if (!closure_blocking(cl))
522 atomic_add(CLOSURE_BLOCKING, &cl->remaining);
523}
524
525/*
526 * Not thread safe - can only be called by the thread running the closure.
527 */
528static inline void clear_closure_blocking(struct closure *cl)
529{
530 if (closure_blocking(cl))
531 atomic_sub(CLOSURE_BLOCKING, &cl->remaining);
532}
533
534/**
535 * closure_wake_up() - wake up all closures on a wait list.
 * @list:	wait list to empty
 *
 * Issues a full memory barrier and then calls __closure_wake_up().
 * NOTE(review): the barrier presumably orders the caller's update of the
 * waited-on condition before the list is read - confirm.
536 */
537static inline void closure_wake_up(struct closure_waitlist *list)
538{
539 smp_mb();
540 __closure_wake_up(list);
541}
542
543/*
544 * Wait on an event, synchronously or asynchronously - analogous to wait_event()
545 * but for closures.
546 *
547 * The loop is oddly structured so as to avoid a race; we must check the
548 * condition again after we've added ourself to the waitlist. We know if we were
549 * already on the waitlist because closure_wait() returns false; thus, we only
550 * schedule or break if closure_wait() returns false. If it returns true, we
551 * just loop again - rechecking the condition.
552 *
553 * The __closure_wake_up() is necessary because we may race with the event
554 * becoming true; i.e. we see event false -> wait -> recheck condition, but the
555 * thread that made the event true may have called closure_wake_up() before we
556 * added ourself to the wait list.
557 *
558 * We have to call closure_sync() at the end instead of just
559 * __closure_end_sleep() because a different thread might've called
560 * closure_wake_up() before us and gotten preempted before they dropped the
561 * refcount on our closure. If this was a stack allocated closure, that would be
562 * bad.
563 */
/*
 * @list:	wait list to park @cl on while @condition is false
 * @cl:		closure doing the waiting
 * @condition:	expression re-evaluated on every pass of the loop
 * @_block:	true to sleep until @condition holds, false to return after
 *		the closure is found to be on the wait list already
 *
 * Evaluates to the last value of @condition. Note that the locals 'block'
 * and 'ret' are visible to @condition when it is expanded.
 */
564#define __closure_wait_event(list, cl, condition, _block) \
565({ \
566 bool block = _block; \
567 typeof(condition) ret; \
568 \
569 while (1) { \
570 ret = (condition); \
571 if (ret) { \
572 __closure_wake_up(list); \
573 if (block) \
574 closure_sync(cl); \
575 \
576 break; \
577 } \
578 \
579 if (block) \
580 __closure_start_sleep(cl); \
581 \
582 if (!closure_wait(list, cl)) { \
583 if (!block) \
584 break; \
585 \
586 schedule(); \
587 } \
588 } \
589 \
590 ret; \
591})
592
593/**
594 * closure_wait_event() - wait on a condition, synchronously or asynchronously.
595 * @list: the wait list to wait on
596 * @cl: the closure that is doing the waiting
597 * @condition: a C expression for the event to wait for
598 *
599 * If the closure is in blocking mode, sleeps until the @condition evaluates to
600 * true - exactly like wait_event().
601 *
602 * If the closure is not in blocking mode, waits asynchronously; if the
603 * condition is currently false the @cl is put onto @list and returns. @list
604 * owns a refcount on @cl; closure_sync() or continue_at() may be used later to
605 * wait for another thread to wake up @list, which drops the refcount on @cl.
606 *
607 * Returns the value of @condition; @cl will be on @list iff @condition was
608 * false.
609 *
610 * closure_wake_up(@list) must be called after changing any variable that could
611 * cause @condition to become true.
612 */
613#define closure_wait_event(list, cl, condition) \
614 __closure_wait_event(list, cl, condition, closure_blocking(cl))
615
616#define closure_wait_event_async(list, cl, condition) \
617 __closure_wait_event(list, cl, condition, false)
618
619#define closure_wait_event_sync(list, cl, condition) \
620 __closure_wait_event(list, cl, condition, true)
621
/*
 * Set the function @cl will run next and the workqueue to run it on
 * (@wq may be NULL, in which case closure_queue() calls @fn directly).
 *
 * BUGs if @cl is on the stack. Ends with a barrier so that the stores to
 * fn and wq are ordered before the refcount decrement that follows in the
 * callers.
 */
622static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
623 struct workqueue_struct *wq)
624{
625 BUG_ON(object_is_on_stack(cl));
626 closure_set_ip(cl);
627 cl->fn = fn;
628 cl->wq = wq;
629 /* between atomic_dec() in closure_put() */
630 smp_mb__before_atomic_dec();
631}
632
/*
 * Arrange for @_fn to run out of @_wq once all outstanding references on
 * @_cl have been dropped, give up the caller's own reference together
 * with CLOSURE_RUNNING, and return from the calling function (which must
 * return void).
 */
633#define continue_at(_cl, _fn, _wq) \
634do { \
635 set_closure_fn(_cl, _fn, _wq); \
636 closure_sub(_cl, CLOSURE_RUNNING + 1); \
637 return; \
638} while (0)
639
640#define closure_return(_cl) continue_at((_cl), NULL, NULL)
641
/*
 * Like continue_at(), but does not wait for outstanding references: the
 * refcount is left untouched and @_fn is queued on @_wq immediately (or
 * called directly if @_wq is NULL). Returns from the calling function,
 * which must return void.
 *
 * The closure is referenced only through the @_cl parameter, so the macro
 * does not depend on the caller having a variable named 'cl'.
 */
#define continue_at_nobarrier(_cl, _fn, _wq)				\
do {									\
	set_closure_fn(_cl, _fn, _wq);					\
	closure_queue(_cl);						\
	return;								\
} while (0)
648
/*
 * Finish @_cl like closure_return(), but have @_destructor called once
 * the refcount reaches 0. A single closure_sub() drops the caller's
 * reference, clears CLOSURE_RUNNING and sets CLOSURE_DESTRUCTOR
 * (subtracting a negative amount adds the flag). Returns from the calling
 * function, which must return void.
 */
649#define closure_return_with_destructor(_cl, _destructor) \
650do { \
651 set_closure_fn(_cl, _destructor, NULL); \
652 closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
653 return; \
654} while (0)
655
/*
 * Initialise @cl with @parent and immediately run @fn: on @wq if it is
 * non-NULL, otherwise directly in the caller's context. The return is
 * supplied by continue_at_nobarrier().
 */
656static inline void closure_call(struct closure *cl, closure_fn fn,
657 struct workqueue_struct *wq,
658 struct closure *parent)
659{
660 closure_init(cl, parent);
661 continue_at_nobarrier(cl, fn, wq);
662}
663
/*
 * If @cl can be locked without waiting, attach @parent and run @fn (on
 * @wq, or directly if @wq is NULL). Does nothing if the closure is
 * already locked.
 */
664static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
665 struct workqueue_struct *wq,
666 struct closure *parent)
667{
668 if (closure_trylock(cl, parent))
669 continue_at_nobarrier(cl, fn, wq);
670}
671
672#endif /* _LINUX_CLOSURE_H */
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
new file mode 100644
index 000000000000..89fd5204924e
--- /dev/null
+++ b/drivers/md/bcache/debug.c
@@ -0,0 +1,565 @@
1/*
2 * Assorted bcache debug code
3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc.
6 */
7
8#include "bcache.h"
9#include "btree.h"
10#include "debug.h"
11#include "request.h"
12
13#include <linux/console.h>
14#include <linux/debugfs.h>
15#include <linux/module.h>
16#include <linux/random.h>
17#include <linux/seq_file.h>
18
19static struct dentry *debug;
20
21const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
22{
23 unsigned i;
24
25 for (i = 0; i < KEY_PTRS(k); i++)
26 if (ptr_available(c, k, i)) {
27 struct cache *ca = PTR_CACHE(c, k, i);
28 size_t bucket = PTR_BUCKET_NR(c, k, i);
29 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
30
31 if (KEY_SIZE(k) + r > c->sb.bucket_size)
32 return "bad, length too big";
33 if (bucket < ca->sb.first_bucket)
34 return "bad, short offset";
35 if (bucket >= ca->sb.nbuckets)
36 return "bad, offset past end of device";
37 if (ptr_stale(c, k, i))
38 return "stale";
39 }
40
41 if (!bkey_cmp(k, &ZERO_KEY))
42 return "bad, null key";
43 if (!KEY_PTRS(k))
44 return "bad, no pointers";
45 if (!KEY_SIZE(k))
46 return "zeroed key";
47 return "";
48}
49
50struct keyprint_hack bch_pkey(const struct bkey *k)
51{
52 unsigned i = 0;
53 struct keyprint_hack r;
54 char *out = r.s, *end = r.s + KEYHACK_SIZE;
55
56#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
57
58 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
59
60 if (KEY_PTRS(k))
61 while (1) {
62 p("%llu:%llu gen %llu",
63 PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
64
65 if (++i == KEY_PTRS(k))
66 break;
67
68 p(", ");
69 }
70
71 p("]");
72
73 if (KEY_DIRTY(k))
74 p(" dirty");
75 if (KEY_CSUM(k))
76 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
77#undef p
78 return r;
79}
80
81struct keyprint_hack bch_pbtree(const struct btree *b)
82{
83 struct keyprint_hack r;
84
85 snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0),
86 b->level, b->c->root ? b->c->root->level : -1);
87 return r;
88}
89
90#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
91
92static bool skipped_backwards(struct btree *b, struct bkey *k)
93{
94 return bkey_cmp(k, (!b->level)
95 ? &START_KEY(bkey_next(k))
96 : bkey_next(k)) > 0;
97}
98
99static void dump_bset(struct btree *b, struct bset *i)
100{
101 struct bkey *k;
102 unsigned j;
103
104 for (k = i->start; k < end(i); k = bkey_next(k)) {
105 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
106 (uint64_t *) k - i->d, i->keys, pkey(k));
107
108 for (j = 0; j < KEY_PTRS(k); j++) {
109 size_t n = PTR_BUCKET_NR(b->c, k, j);
110 printk(" bucket %zu", n);
111
112 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
113 printk(" prio %i",
114 PTR_BUCKET(b->c, k, j)->prio);
115 }
116
117 printk(" %s\n", bch_ptr_status(b->c, k));
118
119 if (bkey_next(k) < end(i) &&
120 skipped_backwards(b, k))
121 printk(KERN_ERR "Key skipped backwards\n");
122 }
123}
124
125#endif
126
127#ifdef CONFIG_BCACHE_DEBUG
128
129void bch_btree_verify(struct btree *b, struct bset *new)
130{
131 struct btree *v = b->c->verify_data;
132 struct closure cl;
133 closure_init_stack(&cl);
134
135 if (!b->c->verify)
136 return;
137
138 closure_wait_event(&b->io.wait, &cl,
139 atomic_read(&b->io.cl.remaining) == -1);
140
141 mutex_lock(&b->c->verify_lock);
142
143 bkey_copy(&v->key, &b->key);
144 v->written = 0;
145 v->level = b->level;
146
147 bch_btree_read(v);
148 closure_wait_event(&v->io.wait, &cl,
149 atomic_read(&b->io.cl.remaining) == -1);
150
151 if (new->keys != v->sets[0].data->keys ||
152 memcmp(new->start,
153 v->sets[0].data->start,
154 (void *) end(new) - (void *) new->start)) {
155 unsigned i, j;
156
157 console_lock();
158
159 printk(KERN_ERR "*** original memory node:\n");
160 for (i = 0; i <= b->nsets; i++)
161 dump_bset(b, b->sets[i].data);
162
163 printk(KERN_ERR "*** sorted memory node:\n");
164 dump_bset(b, new);
165
166 printk(KERN_ERR "*** on disk node:\n");
167 dump_bset(v, v->sets[0].data);
168
169 for (j = 0; j < new->keys; j++)
170 if (new->d[j] != v->sets[0].data->d[j])
171 break;
172
173 console_unlock();
174 panic("verify failed at %u\n", j);
175 }
176
177 mutex_unlock(&b->c->verify_lock);
178}
179
180static void data_verify_endio(struct bio *bio, int error)
181{
182 struct closure *cl = bio->bi_private;
183 closure_put(cl);
184}
185
186void bch_data_verify(struct search *s)
187{
188 char name[BDEVNAME_SIZE];
189 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
190 struct closure *cl = &s->cl;
191 struct bio *check;
192 struct bio_vec *bv;
193 int i;
194
195 if (!s->unaligned_bvec)
196 bio_for_each_segment(bv, s->orig_bio, i)
197 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
198
199 check = bio_clone(s->orig_bio, GFP_NOIO);
200 if (!check)
201 return;
202
203 if (bch_bio_alloc_pages(check, GFP_NOIO))
204 goto out_put;
205
206 check->bi_rw = READ_SYNC;
207 check->bi_private = cl;
208 check->bi_end_io = data_verify_endio;
209
210 closure_bio_submit(check, cl, &dc->disk);
211 closure_sync(cl);
212
213 bio_for_each_segment(bv, s->orig_bio, i) {
214 void *p1 = kmap(bv->bv_page);
215 void *p2 = kmap(check->bi_io_vec[i].bv_page);
216
217 if (memcmp(p1 + bv->bv_offset,
218 p2 + bv->bv_offset,
219 bv->bv_len))
220 printk(KERN_ERR
221 "bcache (%s): verify failed at sector %llu\n",
222 bdevname(dc->bdev, name),
223 (uint64_t) s->orig_bio->bi_sector);
224
225 kunmap(bv->bv_page);
226 kunmap(check->bi_io_vec[i].bv_page);
227 }
228
229 __bio_for_each_segment(bv, check, i, 0)
230 __free_page(bv->bv_page);
231out_put:
232 bio_put(check);
233}
234
235#endif
236
237#ifdef CONFIG_BCACHE_EDEBUG
238
239unsigned bch_count_data(struct btree *b)
240{
241 unsigned ret = 0;
242 struct btree_iter iter;
243 struct bkey *k;
244
245 if (!b->level)
246 for_each_key(b, k, &iter)
247 ret += KEY_SIZE(k);
248 return ret;
249}
250
251static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
252 va_list args)
253{
254 unsigned i;
255
256 console_lock();
257
258 for (i = 0; i <= b->nsets; i++)
259 dump_bset(b, b->sets[i].data);
260
261 vprintk(fmt, args);
262
263 console_unlock();
264
265 panic("at %s\n", pbtree(b));
266}
267
268void bch_check_key_order_msg(struct btree *b, struct bset *i,
269 const char *fmt, ...)
270{
271 struct bkey *k;
272
273 if (!i->keys)
274 return;
275
276 for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k))
277 if (skipped_backwards(b, k)) {
278 va_list args;
279 va_start(args, fmt);
280
281 vdump_bucket_and_panic(b, fmt, args);
282 va_end(args);
283 }
284}
285
286void bch_check_keys(struct btree *b, const char *fmt, ...)
287{
288 va_list args;
289 struct bkey *k, *p = NULL;
290 struct btree_iter iter;
291
292 if (b->level)
293 return;
294
295 for_each_key(b, k, &iter) {
296 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) {
297 printk(KERN_ERR "Keys out of order:\n");
298 goto bug;
299 }
300
301 if (bch_ptr_invalid(b, k))
302 continue;
303
304 if (p && bkey_cmp(p, &START_KEY(k)) > 0) {
305 printk(KERN_ERR "Overlapping keys:\n");
306 goto bug;
307 }
308 p = k;
309 }
310 return;
311bug:
312 va_start(args, fmt);
313 vdump_bucket_and_panic(b, fmt, args);
314 va_end(args);
315}
316
317#endif
318
319#ifdef CONFIG_DEBUG_FS
320
321/* XXX: cache set refcounting */
322
/* Per-open-file state for the debugfs key dump (see bch_dump_read()) */
struct dump_iterator {
	/* formatted text not yet copied to userspace */
	char			buf[PAGE_SIZE];
	/* number of valid bytes at the start of @buf */
	size_t			bytes;
	/* cache set being dumped */
	struct cache_set	*c;
	/* key buffer used to scan the cache set's keys */
	struct keybuf		keys;
};
329
/* Keybuf predicate for the debugfs dump: accepts every key */
static bool dump_pred(struct keybuf *buf, struct bkey *k)
{
	return true;
}
334
335static ssize_t bch_dump_read(struct file *file, char __user *buf,
336 size_t size, loff_t *ppos)
337{
338 struct dump_iterator *i = file->private_data;
339 ssize_t ret = 0;
340
341 while (size) {
342 struct keybuf_key *w;
343 unsigned bytes = min(i->bytes, size);
344
345 int err = copy_to_user(buf, i->buf, bytes);
346 if (err)
347 return err;
348
349 ret += bytes;
350 buf += bytes;
351 size -= bytes;
352 i->bytes -= bytes;
353 memmove(i->buf, i->buf + bytes, i->bytes);
354
355 if (i->bytes)
356 break;
357
358 w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY);
359 if (!w)
360 break;
361
362 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key));
363 bch_keybuf_del(&i->keys, w);
364 }
365
366 return ret;
367}
368
369static int bch_dump_open(struct inode *inode, struct file *file)
370{
371 struct cache_set *c = inode->i_private;
372 struct dump_iterator *i;
373
374 i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL);
375 if (!i)
376 return -ENOMEM;
377
378 file->private_data = i;
379 i->c = c;
380 bch_keybuf_init(&i->keys, dump_pred);
381 i->keys.last_scanned = KEY(0, 0, 0);
382
383 return 0;
384}
385
/* release() handler: frees the dump_iterator allocated by bch_dump_open() */
static int bch_dump_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}
391
/* File operations of the per-cache-set debugfs file: read-only key dump */
static const struct file_operations cache_set_debug_ops = {
	.owner		= THIS_MODULE,
	.open		= bch_dump_open,
	.read		= bch_dump_read,
	.release	= bch_dump_release
};
398
399void bch_debug_init_cache_set(struct cache_set *c)
400{
401 if (!IS_ERR_OR_NULL(debug)) {
402 char name[50];
403 snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
404
405 c->debug = debugfs_create_file(name, 0400, debug, c,
406 &cache_set_debug_ops);
407 }
408}
409
410#endif
411
412/* Fuzz tester has rotted: */
413#if 0
414
415static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
416 const char *buffer, size_t size)
417{
418 void dump(struct btree *b)
419 {
420 struct bset *i;
421
422 for (i = b->sets[0].data;
423 index(i, b) < btree_blocks(b) &&
424 i->seq == b->sets[0].data->seq;
425 i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
426 dump_bset(b, i);
427 }
428
429 struct cache_sb *sb;
430 struct cache_set *c;
431 struct btree *all[3], *b, *fill, *orig;
432 int j;
433
434 struct btree_op op;
435 bch_btree_op_init_stack(&op);
436
437 sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
438 if (!sb)
439 return -ENOMEM;
440
441 sb->bucket_size = 128;
442 sb->block_size = 4;
443
444 c = bch_cache_set_alloc(sb);
445 if (!c)
446 return -ENOMEM;
447
448 for (j = 0; j < 3; j++) {
449 BUG_ON(list_empty(&c->btree_cache));
450 all[j] = list_first_entry(&c->btree_cache, struct btree, list);
451 list_del_init(&all[j]->list);
452
453 all[j]->key = KEY(0, 0, c->sb.bucket_size);
454 bkey_copy_key(&all[j]->key, &MAX_KEY);
455 }
456
457 b = all[0];
458 fill = all[1];
459 orig = all[2];
460
461 while (1) {
462 for (j = 0; j < 3; j++)
463 all[j]->written = all[j]->nsets = 0;
464
465 bch_bset_init_next(b);
466
467 while (1) {
468 struct bset *i = write_block(b);
469 struct bkey *k = op.keys.top;
470 unsigned rand;
471
472 bkey_init(k);
473 rand = get_random_int();
474
475 op.type = rand & 1
476 ? BTREE_INSERT
477 : BTREE_REPLACE;
478 rand >>= 1;
479
480 SET_KEY_SIZE(k, bucket_remainder(c, rand));
481 rand >>= c->bucket_bits;
482 rand &= 1024 * 512 - 1;
483 rand += c->sb.bucket_size;
484 SET_KEY_OFFSET(k, rand);
485#if 0
486 SET_KEY_PTRS(k, 1);
487#endif
488 bch_keylist_push(&op.keys);
489 bch_btree_insert_keys(b, &op);
490
491 if (should_split(b) ||
492 set_blocks(i, b->c) !=
493 __set_blocks(i, i->keys + 15, b->c)) {
494 i->csum = csum_set(i);
495
496 memcpy(write_block(fill),
497 i, set_bytes(i));
498
499 b->written += set_blocks(i, b->c);
500 fill->written = b->written;
501 if (b->written == btree_blocks(b))
502 break;
503
504 bch_btree_sort_lazy(b);
505 bch_bset_init_next(b);
506 }
507 }
508
509 memcpy(orig->sets[0].data,
510 fill->sets[0].data,
511 btree_bytes(c));
512
513 bch_btree_sort(b);
514 fill->written = 0;
515 bch_btree_read_done(&fill->io.cl);
516
517 if (b->sets[0].data->keys != fill->sets[0].data->keys ||
518 memcmp(b->sets[0].data->start,
519 fill->sets[0].data->start,
520 b->sets[0].data->keys * sizeof(uint64_t))) {
521 struct bset *i = b->sets[0].data;
522 struct bkey *k, *l;
523
524 for (k = i->start,
525 l = fill->sets[0].data->start;
526 k < end(i);
527 k = bkey_next(k), l = bkey_next(l))
528 if (bkey_cmp(k, l) ||
529 KEY_SIZE(k) != KEY_SIZE(l))
530 pr_err("key %zi differs: %s != %s",
531 (uint64_t *) k - i->d,
532 pkey(k), pkey(l));
533
534 for (j = 0; j < 3; j++) {
535 pr_err("**** Set %i ****", j);
536 dump(all[j]);
537 }
538 panic("\n");
539 }
540
541 pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
542 }
543}
544
545kobj_attribute_write(fuzz, btree_fuzz);
546#endif
547
/* Remove the bcache debugfs directory and everything below it, if it exists */
void bch_debug_exit(void)
{
	if (!IS_ERR_OR_NULL(debug))
		debugfs_remove_recursive(debug);
}
553
/*
 * bch_debug_init - create the top-level "bcache" debugfs directory.
 *
 * Always returns 0: the result of debugfs_create_dir() is stored in 'debug'
 * and checked by its users rather than reported here.  @kobj is only used by
 * the disabled fuzz-tester hookup.
 */
int __init bch_debug_init(struct kobject *kobj)
{
	int ret = 0;
#if 0
	ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
	if (ret)
		return ret;
#endif

	debug = debugfs_create_dir("bcache", NULL);
	return ret;
}
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
new file mode 100644
index 000000000000..f9378a218148
--- /dev/null
+++ b/drivers/md/bcache/debug.h
@@ -0,0 +1,54 @@
1#ifndef _BCACHE_DEBUG_H
2#define _BCACHE_DEBUG_H
3
4/* Btree/bkey debug printing */
5
/*
 * A fixed-size character buffer wrapped in a struct so that bch_pkey() and
 * bch_pbtree() can return formatted text by value.
 */
#define KEYHACK_SIZE     80
struct keyprint_hack {
	char			s[KEYHACK_SIZE];
};

struct keyprint_hack bch_pkey(const struct bkey *k);
struct keyprint_hack bch_pbtree(const struct btree *b);
/*
 * pkey()/pbtree() evaluate to a pointer to the first character of the struct
 * returned by bch_pkey()/bch_pbtree().
 * NOTE(review): that struct is a temporary, so the pointer is presumably only
 * meant to be used directly as a function argument (e.g. to printk) - confirm.
 */
#define pkey(k)		(&bch_pkey(k).s[0])
#define pbtree(b)	(&bch_pbtree(b).s[0])
15
/*
 * Expensive debug checks: real functions with CONFIG_BCACHE_EDEBUG, otherwise
 * macros that expand to nothing (or to 0 for bch_count_data()).
 */
#ifdef CONFIG_BCACHE_EDEBUG

unsigned bch_count_data(struct btree *);
void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...);
void bch_check_keys(struct btree *, const char *, ...);

#define bch_check_key_order(b, i)			\
	bch_check_key_order_msg(b, i, "keys out of order")
#define EBUG_ON(cond)		BUG_ON(cond)

#else /* EDEBUG */

/* Note: with EDEBUG disabled the macro arguments are not evaluated at all */
#define bch_count_data(b)				0
#define bch_check_key_order(b, i)			do {} while (0)
#define bch_check_key_order_msg(b, i, ...)		do {} while (0)
#define bch_check_keys(b, ...)				do {} while (0)
#define EBUG_ON(cond)					do {} while (0)

#endif
35
36#ifdef CONFIG_BCACHE_DEBUG
37
38void bch_btree_verify(struct btree *, struct bset *);
39void bch_data_verify(struct search *);
40
41#else /* DEBUG */
42
/* Verification is compiled out without CONFIG_BCACHE_DEBUG: empty stubs */
static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
static inline void bch_data_verify(struct search *s) {}
45
46#endif
47
/* Per-cache-set debugfs file; an empty stub when debugfs is not configured */
#ifdef CONFIG_DEBUG_FS
void bch_debug_init_cache_set(struct cache_set *);
#else
static inline void bch_debug_init_cache_set(struct cache_set *c) {}
#endif
53
54#endif
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
new file mode 100644
index 000000000000..48efd4dea645
--- /dev/null
+++ b/drivers/md/bcache/io.c
@@ -0,0 +1,397 @@
1/*
2 * Some low level IO code, and hacks for various block layer limitations
3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc.
6 */
7
8#include "bcache.h"
9#include "bset.h"
10#include "debug.h"
11
12static void bch_bi_idx_hack_endio(struct bio *bio, int error)
13{
14 struct bio *p = bio->bi_private;
15
16 bio_endio(p, error);
17 bio_put(bio);
18}
19
20static void bch_generic_make_request_hack(struct bio *bio)
21{
22 if (bio->bi_idx) {
23 struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio));
24
25 memcpy(clone->bi_io_vec,
26 bio_iovec(bio),
27 bio_segments(bio) * sizeof(struct bio_vec));
28
29 clone->bi_sector = bio->bi_sector;
30 clone->bi_bdev = bio->bi_bdev;
31 clone->bi_rw = bio->bi_rw;
32 clone->bi_vcnt = bio_segments(bio);
33 clone->bi_size = bio->bi_size;
34
35 clone->bi_private = bio;
36 clone->bi_end_io = bch_bi_idx_hack_endio;
37
38 bio = clone;
39 }
40
41 /*
42 * Hack, since drivers that clone bios clone up to bi_max_vecs, but our
43 * bios might have had more than that (before we split them per device
44 * limitations).
45 *
46 * To be taken out once immutable bvec stuff is in.
47 */
48 bio->bi_max_vecs = bio->bi_vcnt;
49
50 generic_make_request(bio);
51}
52
53/**
54 * bch_bio_split - split a bio
55 * @bio: bio to split
56 * @sectors: number of sectors to split from the front of @bio
57 * @gfp: gfp mask
58 * @bs: bio set to allocate from
59 *
60 * Allocates and returns a new bio which represents @sectors from the start of
61 * @bio, and updates @bio to represent the remaining sectors.
62 *
63 * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio
64 * unchanged.
65 *
66 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
67 * bvec boundry; it is the caller's responsibility to ensure that @bio is not
68 * freed before the split.
69 *
70 * If bch_bio_split() is running under generic_make_request(), it's not safe to
71 * allocate more than one bio from the same bio set. Therefore, if it is running
72 * under generic_make_request() it masks out __GFP_WAIT when doing the
73 * allocation. The caller must check for failure if there's any possibility of
74 * it being called from under generic_make_request(); it is then the caller's
75 * responsibility to retry from a safe context (by e.g. punting to workqueue).
76 */
77struct bio *bch_bio_split(struct bio *bio, int sectors,
78 gfp_t gfp, struct bio_set *bs)
79{
80 unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9;
81 struct bio_vec *bv;
82 struct bio *ret = NULL;
83
84 BUG_ON(sectors <= 0);
85
86 /*
87 * If we're being called from underneath generic_make_request() and we
88 * already allocated any bios from this bio set, we risk deadlock if we
89 * use the mempool. So instead, we possibly fail and let the caller punt
90 * to workqueue or somesuch and retry in a safe context.
91 */
92 if (current->bio_list)
93 gfp &= ~__GFP_WAIT;
94
95 if (sectors >= bio_sectors(bio))
96 return bio;
97
98 if (bio->bi_rw & REQ_DISCARD) {
99 ret = bio_alloc_bioset(gfp, 1, bs);
100 idx = 0;
101 goto out;
102 }
103
104 bio_for_each_segment(bv, bio, idx) {
105 vcnt = idx - bio->bi_idx;
106
107 if (!nbytes) {
108 ret = bio_alloc_bioset(gfp, vcnt, bs);
109 if (!ret)
110 return NULL;
111
112 memcpy(ret->bi_io_vec, bio_iovec(bio),
113 sizeof(struct bio_vec) * vcnt);
114
115 break;
116 } else if (nbytes < bv->bv_len) {
117 ret = bio_alloc_bioset(gfp, ++vcnt, bs);
118 if (!ret)
119 return NULL;
120
121 memcpy(ret->bi_io_vec, bio_iovec(bio),
122 sizeof(struct bio_vec) * vcnt);
123
124 ret->bi_io_vec[vcnt - 1].bv_len = nbytes;
125 bv->bv_offset += nbytes;
126 bv->bv_len -= nbytes;
127 break;
128 }
129
130 nbytes -= bv->bv_len;
131 }
132out:
133 ret->bi_bdev = bio->bi_bdev;
134 ret->bi_sector = bio->bi_sector;
135 ret->bi_size = sectors << 9;
136 ret->bi_rw = bio->bi_rw;
137 ret->bi_vcnt = vcnt;
138 ret->bi_max_vecs = vcnt;
139
140 bio->bi_sector += sectors;
141 bio->bi_size -= sectors << 9;
142 bio->bi_idx = idx;
143
144 if (bio_integrity(bio)) {
145 if (bio_integrity_clone(ret, bio, gfp)) {
146 bio_put(ret);
147 return NULL;
148 }
149
150 bio_integrity_trim(ret, 0, bio_sectors(ret));
151 bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio));
152 }
153
154 return ret;
155}
156
157static unsigned bch_bio_max_sectors(struct bio *bio)
158{
159 unsigned ret = bio_sectors(bio);
160 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
161 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
162 queue_max_segments(q));
163 struct bio_vec *bv, *end = bio_iovec(bio) +
164 min_t(int, bio_segments(bio), max_segments);
165
166 if (bio->bi_rw & REQ_DISCARD)
167 return min(ret, q->limits.max_discard_sectors);
168
169 if (bio_segments(bio) > max_segments ||
170 q->merge_bvec_fn) {
171 ret = 0;
172
173 for (bv = bio_iovec(bio); bv < end; bv++) {
174 struct bvec_merge_data bvm = {
175 .bi_bdev = bio->bi_bdev,
176 .bi_sector = bio->bi_sector,
177 .bi_size = ret << 9,
178 .bi_rw = bio->bi_rw,
179 };
180
181 if (q->merge_bvec_fn &&
182 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
183 break;
184
185 ret += bv->bv_len >> 9;
186 }
187 }
188
189 ret = min(ret, queue_max_sectors(q));
190
191 WARN_ON(!ret);
192 ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9);
193
194 return ret;
195}
196
197static void bch_bio_submit_split_done(struct closure *cl)
198{
199 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
200
201 s->bio->bi_end_io = s->bi_end_io;
202 s->bio->bi_private = s->bi_private;
203 bio_endio(s->bio, 0);
204
205 closure_debug_destroy(&s->cl);
206 mempool_free(s, s->p->bio_split_hook);
207}
208
209static void bch_bio_submit_split_endio(struct bio *bio, int error)
210{
211 struct closure *cl = bio->bi_private;
212 struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
213
214 if (error)
215 clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
216
217 bio_put(bio);
218 closure_put(cl);
219}
220
/*
 * Closure function that repeatedly splits the hooked bio into fragments the
 * device can accept and submits each one.
 *
 * If a fragment cannot be allocated, the function reschedules itself on
 * system_wq and returns (continue_at() contains a return).  The loop ends
 * when bch_bio_split() hands back the original bio itself, i.e. the remainder
 * fit in one piece; bch_bio_submit_split_done() then runs via continue_at().
 */
static void __bch_bio_submit_split(struct closure *cl)
{
	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
	struct bio *bio = s->bio, *n;

	do {
		n = bch_bio_split(bio, bch_bio_max_sectors(bio),
				  GFP_NOIO, s->p->bio_split);
		if (!n)
			continue_at(cl, __bch_bio_submit_split, system_wq);

		n->bi_end_io	= bch_bio_submit_split_endio;
		n->bi_private	= cl;

		/* one closure reference per in-flight fragment */
		closure_get(cl);
		bch_generic_make_request_hack(n);
	} while (n != bio);

	continue_at(cl, bch_bio_submit_split_done, NULL);
}
241
242void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
243{
244 struct bio_split_hook *s;
245
246 if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
247 goto submit;
248
249 if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
250 goto submit;
251
252 s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
253
254 s->bio = bio;
255 s->p = p;
256 s->bi_end_io = bio->bi_end_io;
257 s->bi_private = bio->bi_private;
258 bio_get(bio);
259
260 closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL);
261 return;
262submit:
263 bch_generic_make_request_hack(bio);
264}
265
266/* Bios with headers */
267
268void bch_bbio_free(struct bio *bio, struct cache_set *c)
269{
270 struct bbio *b = container_of(bio, struct bbio, bio);
271 mempool_free(b, c->bio_meta);
272}
273
274struct bio *bch_bbio_alloc(struct cache_set *c)
275{
276 struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
277 struct bio *bio = &b->bio;
278
279 bio_init(bio);
280 bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
281 bio->bi_max_vecs = bucket_pages(c);
282 bio->bi_io_vec = bio->bi_inline_vecs;
283
284 return bio;
285}
286
287void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
288{
289 struct bbio *b = container_of(bio, struct bbio, bio);
290
291 bio->bi_sector = PTR_OFFSET(&b->key, 0);
292 bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev;
293
294 b->submit_time_us = local_clock_us();
295 closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
296}
297
/*
 * bch_submit_bbio - copy pointer @ptr of key @k into the bbio's key and
 * submit @bio to the device that pointer refers to.
 */
void bch_submit_bbio(struct bio *bio, struct cache_set *c,
		     struct bkey *k, unsigned ptr)
{
	struct bbio *b = container_of(bio, struct bbio, bio);
	bch_bkey_copy_single_ptr(&b->key, k, ptr);
	__bch_submit_bbio(bio, c);
}
305
306/* IO errors */
307
308void bch_count_io_errors(struct cache *ca, int error, const char *m)
309{
310 /*
311 * The halflife of an error is:
312 * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
313 */
314
315 if (ca->set->error_decay) {
316 unsigned count = atomic_inc_return(&ca->io_count);
317
318 while (count > ca->set->error_decay) {
319 unsigned errors;
320 unsigned old = count;
321 unsigned new = count - ca->set->error_decay;
322
323 /*
324 * First we subtract refresh from count; each time we
325 * succesfully do so, we rescale the errors once:
326 */
327
328 count = atomic_cmpxchg(&ca->io_count, old, new);
329
330 if (count == old) {
331 count = new;
332
333 errors = atomic_read(&ca->io_errors);
334 do {
335 old = errors;
336 new = ((uint64_t) errors * 127) / 128;
337 errors = atomic_cmpxchg(&ca->io_errors,
338 old, new);
339 } while (old != errors);
340 }
341 }
342 }
343
344 if (error) {
345 char buf[BDEVNAME_SIZE];
346 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
347 &ca->io_errors);
348 errors >>= IO_ERROR_SHIFT;
349
350 if (errors < ca->set->error_limit)
351 pr_err("%s: IO error on %s, recovering",
352 bdevname(ca->bdev, buf), m);
353 else
354 bch_cache_set_error(ca->set,
355 "%s: too many IO errors %s",
356 bdevname(ca->bdev, buf), m);
357 }
358}
359
360void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
361 int error, const char *m)
362{
363 struct bbio *b = container_of(bio, struct bbio, bio);
364 struct cache *ca = PTR_CACHE(c, &b->key, 0);
365
366 unsigned threshold = bio->bi_rw & REQ_WRITE
367 ? c->congested_write_threshold_us
368 : c->congested_read_threshold_us;
369
370 if (threshold) {
371 unsigned t = local_clock_us();
372
373 int us = t - b->submit_time_us;
374 int congested = atomic_read(&c->congested);
375
376 if (us > (int) threshold) {
377 int ms = us / 1024;
378 c->congested_last_us = t;
379
380 ms = min(ms, CONGESTED_MAX + congested);
381 atomic_sub(ms, &c->congested);
382 } else if (congested < 0)
383 atomic_inc(&c->congested);
384 }
385
386 bch_count_io_errors(ca, error, m);
387}
388
/*
 * bch_bbio_endio - common completion path for bbios: do the error/congestion
 * accounting, drop the bio, and drop the closure reference held in
 * bi_private.
 */
void bch_bbio_endio(struct cache_set *c, struct bio *bio,
		    int error, const char *m)
{
	/* read bi_private before bio_put() may release the bio */
	struct closure *cl = bio->bi_private;

	bch_bbio_count_io_errors(c, bio, error, m);
	bio_put(bio);
	closure_put(cl);
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
new file mode 100644
index 000000000000..8c8dfdcd9d4c
--- /dev/null
+++ b/drivers/md/bcache/journal.c
@@ -0,0 +1,787 @@
1/*
2 * bcache journalling code, for btree insertions
3 *
4 * Copyright 2012 Google, Inc.
5 */
6
7#include "bcache.h"
8#include "btree.h"
9#include "debug.h"
10#include "request.h"
11
12/*
13 * Journal replay/recovery:
14 *
15 * This code is all driven from run_cache_set(); we first read the journal
16 * entries, do some other stuff, then we mark all the keys in the journal
17 * entries (same as garbage collection would), then we replay them - reinserting
18 * them into the cache in precisely the same order as they appear in the
19 * journal.
20 *
21 * We only journal keys that go in leaf nodes, which simplifies things quite a
22 * bit.
23 */
24
25static void journal_read_endio(struct bio *bio, int error)
26{
27 struct closure *cl = bio->bi_private;
28 closure_put(cl);
29}
30
/*
 * journal_read_bucket - read one journal bucket of cache @ca and merge the
 * journal entries found in it into @list, which is kept sorted by sequence
 * number.
 *
 * Returns 1 if at least one new entry was added to @list, 0 if none was, and
 * -ENOMEM if an entry could not be allocated.  Reading stops at the first
 * block that does not hold a valid entry (bad magic, implausible size or bad
 * checksum).
 */
static int journal_read_bucket(struct cache *ca, struct list_head *list,
			       struct btree_op *op, unsigned bucket_index)
{
	struct journal_device *ja = &ca->journal;
	struct bio *bio = &ja->bio;

	struct journal_replay *i;
	struct jset *j, *data = ca->set->journal.w[0].data;
	unsigned len, left, offset = 0;
	int ret = 0;
	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);

	pr_debug("reading %llu", (uint64_t) bucket);

	while (offset < ca->sb.bucket_size) {
		/* (re)read up to 8 pages starting at the current offset */
reread:		left = ca->sb.bucket_size - offset;
		len = min_t(unsigned, left, PAGE_SECTORS * 8);

		bio_reset(bio);
		bio->bi_sector	= bucket + offset;
		bio->bi_bdev	= ca->bdev;
		bio->bi_rw	= READ;
		bio->bi_size	= len << 9;

		bio->bi_end_io	= journal_read_endio;
		bio->bi_private = &op->cl;
		bch_bio_map(bio, data);

		closure_bio_submit(bio, &op->cl, ca);
		closure_sync(&op->cl);

		/* This function could be simpler now since we no longer write
		 * journal entries that overlap bucket boundaries; this means
		 * the start of a bucket will always have a valid journal entry
		 * if it has any journal entries at all.
		 */

		j = data;
		while (len) {
			struct list_head *where;
			size_t blocks, bytes = set_bytes(j);

			if (j->magic != jset_magic(ca->set))
				return ret;

			/* entry claims to extend past the end of the bucket */
			if (bytes > left << 9)
				return ret;

			/* entry extends past what was read: read again from here */
			if (bytes > len << 9)
				goto reread;

			if (j->csum != csum_set(j))
				return ret;

			blocks = set_blocks(j, ca->set);

			/* drop entries older than this entry's last_seq */
			while (!list_empty(list)) {
				i = list_first_entry(list,
					struct journal_replay, list);
				if (i->j.seq >= j->last_seq)
					break;
				list_del(&i->list);
				kfree(i);
			}

			/* find the insertion point, skipping duplicates and stale entries */
			list_for_each_entry_reverse(i, list, list) {
				if (j->seq == i->j.seq)
					goto next_set;

				if (j->seq < i->j.last_seq)
					goto next_set;

				if (j->seq > i->j.seq) {
					where = &i->list;
					goto add;
				}
			}

			where = list;
add:
			i = kmalloc(offsetof(struct journal_replay, j) +
				    bytes, GFP_KERNEL);
			if (!i)
				return -ENOMEM;
			memcpy(&i->j, j, bytes);
			list_add(&i->list, where);
			ret = 1;

			ja->seq[bucket_index] = j->seq;
next_set:
			/* advance past this entry, in whole blocks */
			offset	+= blocks * ca->sb.block_size;
			len	-= blocks * ca->sb.block_size;
			j = ((void *) j) + blocks * block_bytes(ca);
		}
	}

	return ret;
}
129
130int bch_journal_read(struct cache_set *c, struct list_head *list,
131 struct btree_op *op)
132{
133#define read_bucket(b) \
134 ({ \
135 int ret = journal_read_bucket(ca, list, op, b); \
136 __set_bit(b, bitmap); \
137 if (ret < 0) \
138 return ret; \
139 ret; \
140 })
141
142 struct cache *ca;
143 unsigned iter;
144
145 for_each_cache(ca, c, iter) {
146 struct journal_device *ja = &ca->journal;
147 unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
148 unsigned i, l, r, m;
149 uint64_t seq;
150
151 bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
152 pr_debug("%u journal buckets", ca->sb.njournal_buckets);
153
154 /* Read journal buckets ordered by golden ratio hash to quickly
155 * find a sequence of buckets with valid journal entries
156 */
157 for (i = 0; i < ca->sb.njournal_buckets; i++) {
158 l = (i * 2654435769U) % ca->sb.njournal_buckets;
159
160 if (test_bit(l, bitmap))
161 break;
162
163 if (read_bucket(l))
164 goto bsearch;
165 }
166
167 /* If that fails, check all the buckets we haven't checked
168 * already
169 */
170 pr_debug("falling back to linear search");
171
172 for (l = 0; l < ca->sb.njournal_buckets; l++) {
173 if (test_bit(l, bitmap))
174 continue;
175
176 if (read_bucket(l))
177 goto bsearch;
178 }
179bsearch:
180 /* Binary search */
181 m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
182 pr_debug("starting binary search, l %u r %u", l, r);
183
184 while (l + 1 < r) {
185 m = (l + r) >> 1;
186
187 if (read_bucket(m))
188 l = m;
189 else
190 r = m;
191 }
192
193 /* Read buckets in reverse order until we stop finding more
194 * journal entries
195 */
196 pr_debug("finishing up");
197 l = m;
198
199 while (1) {
200 if (!l--)
201 l = ca->sb.njournal_buckets - 1;
202
203 if (l == m)
204 break;
205
206 if (test_bit(l, bitmap))
207 continue;
208
209 if (!read_bucket(l))
210 break;
211 }
212
213 seq = 0;
214
215 for (i = 0; i < ca->sb.njournal_buckets; i++)
216 if (ja->seq[i] > seq) {
217 seq = ja->seq[i];
218 ja->cur_idx = ja->discard_idx =
219 ja->last_idx = i;
220
221 }
222 }
223
224 c->journal.seq = list_entry(list->prev,
225 struct journal_replay,
226 list)->j.seq;
227
228 return 0;
229#undef read_bucket
230}
231
232void bch_journal_mark(struct cache_set *c, struct list_head *list)
233{
234 atomic_t p = { 0 };
235 struct bkey *k;
236 struct journal_replay *i;
237 struct journal *j = &c->journal;
238 uint64_t last = j->seq;
239
240 /*
241 * journal.pin should never fill up - we never write a journal
242 * entry when it would fill up. But if for some reason it does, we
243 * iterate over the list in reverse order so that we can just skip that
244 * refcount instead of bugging.
245 */
246
247 list_for_each_entry_reverse(i, list, list) {
248 BUG_ON(last < i->j.seq);
249 i->pin = NULL;
250
251 while (last-- != i->j.seq)
252 if (fifo_free(&j->pin) > 1) {
253 fifo_push_front(&j->pin, p);
254 atomic_set(&fifo_front(&j->pin), 0);
255 }
256
257 if (fifo_free(&j->pin) > 1) {
258 fifo_push_front(&j->pin, p);
259 i->pin = &fifo_front(&j->pin);
260 atomic_set(i->pin, 1);
261 }
262
263 for (k = i->j.start;
264 k < end(&i->j);
265 k = bkey_next(k)) {
266 unsigned j;
267
268 for (j = 0; j < KEY_PTRS(k); j++) {
269 struct bucket *g = PTR_BUCKET(c, k, j);
270 atomic_inc(&g->pin);
271
272 if (g->prio == BTREE_PRIO &&
273 !ptr_stale(c, k, j))
274 g->prio = INITIAL_PRIO;
275 }
276
277 __bch_btree_mark_key(c, 0, k);
278 }
279 }
280}
281
282int bch_journal_replay(struct cache_set *s, struct list_head *list,
283 struct btree_op *op)
284{
285 int ret = 0, keys = 0, entries = 0;
286 struct bkey *k;
287 struct journal_replay *i =
288 list_entry(list->prev, struct journal_replay, list);
289
290 uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
291
292 list_for_each_entry(i, list, list) {
293 BUG_ON(i->pin && atomic_read(i->pin) != 1);
294
295 if (n != i->j.seq)
296 pr_err(
297 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
298 n, i->j.seq - 1, start, end);
299
300 for (k = i->j.start;
301 k < end(&i->j);
302 k = bkey_next(k)) {
303 pr_debug("%s", pkey(k));
304 bkey_copy(op->keys.top, k);
305 bch_keylist_push(&op->keys);
306
307 op->journal = i->pin;
308 atomic_inc(op->journal);
309
310 ret = bch_btree_insert(op, s);
311 if (ret)
312 goto err;
313
314 BUG_ON(!bch_keylist_empty(&op->keys));
315 keys++;
316
317 cond_resched();
318 }
319
320 if (i->pin)
321 atomic_dec(i->pin);
322 n = i->j.seq + 1;
323 entries++;
324 }
325
326 pr_info("journal replay done, %i keys in %i entries, seq %llu",
327 keys, entries, end);
328
329 while (!list_empty(list)) {
330 i = list_first_entry(list, struct journal_replay, list);
331 list_del(&i->list);
332 kfree(i);
333 }
334err:
335 closure_sync(&op->cl);
336 return ret;
337}
338
339/* Journalling */
340
/*
 * Write out one dirty btree node. The first pass scans the cached nodes with
 * a trylock and keeps locked the dirty node that journal_pin_cmp() ranks
 * ahead of the others; if nothing was found, the second pass takes the first
 * dirty leaf on c->btree_cache with a blocking rw_lock().
 */
341static void btree_flush_write(struct cache_set *c)
342{
343 /*
344 * Try to find the btree node that references the oldest journal
345 * entry, best is our current candidate and is locked if non NULL:
346 */
347 struct btree *b, *best = NULL;
348 unsigned iter;
349
350 for_each_cached_btree(b, c, iter) {
351 if (!down_write_trylock(&b->lock))
352 continue;
353
/* Only dirty nodes whose current write holds a journal pin qualify */
354 if (!btree_node_dirty(b) ||
355 !btree_current_write(b)->journal) {
356 rw_unlock(true, b);
357 continue;
358 }
359
360 if (!best)
361 best = b;
362 else if (journal_pin_cmp(c,
363 btree_current_write(best),
364 btree_current_write(b))) {
/* b wins: release the old candidate, keep b locked */
365 rw_unlock(true, best);
366 best = b;
367 } else
368 rw_unlock(true, b);
369 }
370
371 if (best)
372 goto out;
373
374 /* We can't find the best btree node, just pick the first */
375 list_for_each_entry(b, &c->btree_cache, list)
376 if (!b->level && btree_node_dirty(b)) {
377 best = b;
378 rw_lock(true, best, best->level);
379 goto found;
380 }
381
382out:
383 if (!best)
384 return;
385found:
/* best is locked on every path that reaches this point */
386 if (btree_node_dirty(best))
387 bch_btree_write(best, true, NULL);
388 rw_unlock(true, best);
389}
390
/*
 * Sequence number at the front of the pin fifo: the current sequence number
 * minus the number of entries in the fifo, plus one.
 */
391#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
392
393static void journal_discard_endio(struct bio *bio, int error)
394{
395 struct journal_device *ja =
396 container_of(bio, struct journal_device, discard_bio);
397 struct cache *ca = container_of(ja, struct cache, journal);
398
399 atomic_set(&ja->discard_in_flight, DISCARD_DONE);
400
401 closure_wake_up(&ca->set->journal.wait);
402 closure_put(&ca->set->cl);
403}
404
405static void journal_discard_work(struct work_struct *work)
406{
407 struct journal_device *ja =
408 container_of(work, struct journal_device, discard_work);
409
410 submit_bio(0, &ja->discard_bio);
411}
412
413static void do_journal_discard(struct cache *ca)
414{
415 struct journal_device *ja = &ca->journal;
416 struct bio *bio = &ja->discard_bio;
417
418 if (!ca->discard) {
419 ja->discard_idx = ja->last_idx;
420 return;
421 }
422
423 switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) {
424 case DISCARD_IN_FLIGHT:
425 return;
426
427 case DISCARD_DONE:
428 ja->discard_idx = (ja->discard_idx + 1) %
429 ca->sb.njournal_buckets;
430
431 atomic_set(&ja->discard_in_flight, DISCARD_READY);
432 /* fallthrough */
433
434 case DISCARD_READY:
435 if (ja->discard_idx == ja->last_idx)
436 return;
437
438 atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
439
440 bio_init(bio);
441 bio->bi_sector = bucket_to_sector(ca->set,
442 ca->sb.d[ja->discard_idx]);
443 bio->bi_bdev = ca->bdev;
444 bio->bi_rw = REQ_WRITE|REQ_DISCARD;
445 bio->bi_max_vecs = 1;
446 bio->bi_io_vec = bio->bi_inline_vecs;
447 bio->bi_size = bucket_bytes(ca);
448 bio->bi_end_io = journal_discard_endio;
449
450 closure_get(&ca->set->cl);
451 INIT_WORK(&ja->discard_work, journal_discard_work);
452 schedule_work(&ja->discard_work);
453 }
454}
455
/*
 * Release journal space that is no longer pinned and, if the current
 * bucket(s) have no free blocks left, move on to the next journal bucket on
 * each cache device. Wakes c->journal.wait if the journal is no longer full.
 */
456static void journal_reclaim(struct cache_set *c)
457{
458 struct bkey *k = &c->journal.key;
459 struct cache *ca;
460 uint64_t last_seq;
461 unsigned iter, n = 0;
462 atomic_t p;
463
/* Pop pins from the front of the fifo while their refcount is zero */
464 while (!atomic_read(&fifo_front(&c->journal.pin)))
465 fifo_pop(&c->journal.pin, p);
466
467 last_seq = last_seq(&c->journal);
468
469 /* Update last_idx */
470
471 for_each_cache(ca, c, iter) {
472 struct journal_device *ja = &ca->journal;
473
/* Skip buckets whose recorded sequence number is older than last_seq */
474 while (ja->last_idx != ja->cur_idx &&
475 ja->seq[ja->last_idx] < last_seq)
476 ja->last_idx = (ja->last_idx + 1) %
477 ca->sb.njournal_buckets;
478 }
479
480 for_each_cache(ca, c, iter)
481 do_journal_discard(ca);
482
/* Still room in the bucket(s) being written: nothing to allocate */
483 if (c->journal.blocks_free)
484 return;
485
486 /*
487 * Allocate:
488 * XXX: Sort by free journal space
489 */
490
491 for_each_cache(ca, c, iter) {
492 struct journal_device *ja = &ca->journal;
493 unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
494
495 /* No space available on this device */
496 if (next == ja->discard_idx)
497 continue;
498
499 ja->cur_idx = next;
/* Record a pointer to the start of the new bucket in the journal key */
500 k->ptr[n++] = PTR(0,
501 bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
502 ca->sb.nr_this_dev);
503 }
504
505 bkey_init(k);
506 SET_KEY_PTRS(k, n);
507
/* At least one device supplied a bucket: a whole bucket of blocks is free */
508 if (n)
509 c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
510
511 if (!journal_full(&c->journal))
512 __closure_wake_up(&c->journal.wait);
513}
514
515void bch_journal_next(struct journal *j)
516{
517 atomic_t p = { 1 };
518
519 j->cur = (j->cur == j->w)
520 ? &j->w[1]
521 : &j->w[0];
522
523 /*
524 * The fifo_push() needs to happen at the same time as j->seq is
525 * incremented for last_seq() to be calculated correctly
526 */
527 BUG_ON(!fifo_push(&j->pin, p));
528 atomic_set(&fifo_back(&j->pin), 1);
529
530 j->cur->data->seq = ++j->seq;
531 j->cur->need_write = false;
532 j->cur->data->keys = 0;
533
534 if (fifo_full(&j->pin))
535 pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
536}
537
538static void journal_write_endio(struct bio *bio, int error)
539{
540 struct journal_write *w = bio->bi_private;
541
542 cache_set_err_on(error, w->c, "journal io error");
543 closure_put(&w->c->journal.io.cl);
544}
545
546static void journal_write(struct closure *);
547
548static void journal_write_done(struct closure *cl)
549{
550 struct journal *j = container_of(cl, struct journal, io.cl);
551 struct cache_set *c = container_of(j, struct cache_set, journal);
552
553 struct journal_write *w = (j->cur == j->w)
554 ? &j->w[1]
555 : &j->w[0];
556
557 __closure_wake_up(&w->wait);
558
559 if (c->journal_delay_ms)
560 closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
561
562 continue_at(cl, journal_write, system_wq);
563}
564
/*
 * Write out the current journal entry. Called with c->journal.lock held and
 * releases it on every path. If nothing needs writing the closure is
 * released; if the journal is full it reclaims, flushes a btree node and
 * retries via journal_write(); otherwise it fills in the jset header, queues
 * one bio per journal key pointer, opens the next entry and submits the bios.
 */
565static void journal_write_unlocked(struct closure *cl)
566 __releases(c->journal.lock)
567{
568 struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
569 struct cache *ca;
570 struct journal_write *w = c->journal.cur;
571 struct bkey *k = &c->journal.key;
572 unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
573
574 struct bio *bio;
575 struct bio_list list;
576 bio_list_init(&list);
577
578 if (!w->need_write) {
579 /*
580 * XXX: have to unlock closure before we unlock journal lock,
581 * else we race with bch_journal(). But this way we race
582 * against cache set unregister. Doh.
583 */
584 set_closure_fn(cl, NULL, NULL);
585 closure_sub(cl, CLOSURE_RUNNING + 1);
586 spin_unlock(&c->journal.lock);
587 return;
588 } else if (journal_full(&c->journal)) {
/* No room: try to free space and retry from the workqueue */
589 journal_reclaim(c);
590 spin_unlock(&c->journal.lock);
591
592 btree_flush_write(c);
593 continue_at(cl, journal_write, system_wq);
594 }
595
596 c->journal.blocks_free -= set_blocks(w->data, c);
597
/* Fill in the header fields of the jset before it is checksummed */
598 w->data->btree_level = c->root->level;
599
600 bkey_copy(&w->data->btree_root, &c->root->key);
601 bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
602
603 for_each_cache(ca, c, i)
604 w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
605
606 w->data->magic = jset_magic(c);
607 w->data->version = BCACHE_JSET_VERSION;
608 w->data->last_seq = last_seq(&c->journal);
609 w->data->csum = csum_set(w->data);
610
/* One write bio per pointer in the journal key */
611 for (i = 0; i < KEY_PTRS(k); i++) {
612 ca = PTR_CACHE(c, k, i);
613 bio = &ca->journal.bio;
614
615 atomic_long_add(sectors, &ca->meta_sectors_written);
616
617 bio_reset(bio);
618 bio->bi_sector = PTR_OFFSET(k, i);
619 bio->bi_bdev = ca->bdev;
620 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
621 bio->bi_size = sectors << 9;
622
623 bio->bi_end_io = journal_write_endio;
624 bio->bi_private = w;
625 bch_bio_map(bio, w->data);
626
627 trace_bcache_journal_write(bio);
628 bio_list_add(&list, bio);
629
/* Advance the pointer past what this write will occupy */
630 SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
631
632 ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
633 }
634
/* Drop the pin held on the entry being written, then open the next one */
635 atomic_dec_bug(&fifo_back(&c->journal.pin));
636 bch_journal_next(&c->journal);
637 journal_reclaim(c);
638
639 spin_unlock(&c->journal.lock);
640
/* Bios are submitted only after the journal lock has been dropped */
641 while ((bio = bio_list_pop(&list)))
642 closure_bio_submit(bio, cl, c->cache[0]);
643
644 continue_at(cl, journal_write_done, NULL);
645}
646
647static void journal_write(struct closure *cl)
648{
649 struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
650
651 spin_lock(&c->journal.lock);
652 journal_write_unlocked(cl);
653}
654
655static void __journal_try_write(struct cache_set *c, bool noflush)
656 __releases(c->journal.lock)
657{
658 struct closure *cl = &c->journal.io.cl;
659
660 if (!closure_trylock(cl, &c->cl))
661 spin_unlock(&c->journal.lock);
662 else if (noflush && journal_full(&c->journal)) {
663 spin_unlock(&c->journal.lock);
664 continue_at(cl, journal_write, system_wq);
665 } else
666 journal_write_unlocked(cl);
667}
668
/* __journal_try_write() with noflush == false */
669#define journal_try_write(c) __journal_try_write(c, false)
670
671void bch_journal_meta(struct cache_set *c, struct closure *cl)
672{
673 struct journal_write *w;
674
675 if (CACHE_SYNC(&c->sb)) {
676 spin_lock(&c->journal.lock);
677
678 w = c->journal.cur;
679 w->need_write = true;
680
681 if (cl)
682 BUG_ON(!closure_wait(&w->wait, cl));
683
684 __journal_try_write(c, true);
685 }
686}
687
688/*
689 * Entry point to the journalling code - bio_insert() and btree_invalidate()
690 * pass bch_journal() a list of keys to be journalled, and then
691 * bch_journal() hands those same keys off to btree_insert_async()
692 */
693
/*
 * Append the keys in the btree_op embedding @cl to the current journal
 * entry, then continue with bch_btree_insert_async(). Journalling is skipped
 * for anything other than BTREE_INSERT and when CACHE_SYNC is not set. If
 * the journal is full or the keys do not fit, the closure waits and this
 * function is re-run from bcache_wq.
 */
694void bch_journal(struct closure *cl)
695{
696 struct btree_op *op = container_of(cl, struct btree_op, cl);
697 struct cache_set *c = op->c;
698 struct journal_write *w;
/* n: size of the keylist, counted in 64-bit words */
699 size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
700
701 if (op->type != BTREE_INSERT ||
702 !CACHE_SYNC(&c->sb))
703 goto out;
704
705 /*
706 * If we're looping because we errored, might already be waiting on
707 * another journal write:
708 */
709 while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
710 closure_sync(cl->parent);
711
712 spin_lock(&c->journal.lock);
713
714 if (journal_full(&c->journal)) {
715 /* XXX: tracepoint */
716 closure_wait(&c->journal.wait, cl);
717
718 journal_reclaim(c);
719 spin_unlock(&c->journal.lock);
720
721 btree_flush_write(c);
722 continue_at(cl, bch_journal, bcache_wq);
723 }
724
725 w = c->journal.cur;
726 w->need_write = true;
/* b: blocks the entry would occupy with these keys added */
727 b = __set_blocks(w->data, w->data->keys + n, c);
728
729 if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
730 b > c->journal.blocks_free) {
731 /* XXX: If we were inserting so many keys that they won't fit in
732 * an _empty_ journal write, we'll deadlock. For now, handle
733 * this in bch_keylist_realloc() - but something to think about.
734 */
735 BUG_ON(!w->data->keys);
736
737 /* XXX: tracepoint */
738 BUG_ON(!closure_wait(&w->wait, cl));
739
740 closure_flush(&c->journal.io);
741
/* journal_try_write() releases c->journal.lock */
742 journal_try_write(c);
743 continue_at(cl, bch_journal, bcache_wq);
744 }
745
/* Keys fit: copy them to the end of the current entry */
746 memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
747 w->data->keys += n;
748
/* Take a reference on the pin of the entry the keys went into */
749 op->journal = &fifo_back(&c->journal.pin);
750 atomic_inc(op->journal);
751
752 if (op->flush_journal) {
753 closure_flush(&c->journal.io);
754 closure_wait(&w->wait, cl->parent);
755 }
756
757 journal_try_write(c);
758out:
759 bch_btree_insert_async(cl);
760}
761
762void bch_journal_free(struct cache_set *c)
763{
764 free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
765 free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
766 free_fifo(&c->journal.pin);
767}
768
769int bch_journal_alloc(struct cache_set *c)
770{
771 struct journal *j = &c->journal;
772
773 closure_init_unlocked(&j->io);
774 spin_lock_init(&j->lock);
775
776 c->journal_delay_ms = 100;
777
778 j->w[0].c = c;
779 j->w[1].c = c;
780
781 if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
782 !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
783 !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
784 return -ENOMEM;
785
786 return 0;
787}
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
new file mode 100644
index 000000000000..3d7851274b04
--- /dev/null
+++ b/drivers/md/bcache/journal.h
@@ -0,0 +1,215 @@
1#ifndef _BCACHE_JOURNAL_H
2#define _BCACHE_JOURNAL_H
3
4/*
5 * THE JOURNAL:
6 *
7 * The journal is treated as a circular buffer of buckets - a journal entry
8 * never spans two buckets. This means (not implemented yet) we can resize the
9 * journal at runtime, and will be needed for bcache on raw flash support.
10 *
11 * Journal entries contain a list of keys, ordered by the time they were
12 * inserted; thus journal replay just has to reinsert the keys.
13 *
14 * We also keep some things in the journal header that are logically part of the
15 * superblock - all the things that are frequently updated. This is for future
16 * bcache on raw flash support; the superblock (which will become another
17 * journal) can't be moved or wear leveled, so it contains just enough
18 * information to find the main journal, and the superblock only has to be
19 * rewritten when we want to move/wear level the main journal.
20 *
21 * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be
22 * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions
23 * from cache misses, which don't have to be journaled, and for writeback and
24 * moving gc we work around it by flushing the btree to disk before updating the
25 * gc information. But it is a potential issue with incremental garbage
26 * collection, and it's fragile.
27 *
28 * OPEN JOURNAL ENTRIES:
29 *
30 * Each journal entry contains, in the header, the sequence number of the last
31 * journal entry still open - i.e. that has keys that haven't been flushed to
32 * disk in the btree.
33 *
34 * We track this by maintaining a refcount for every open journal entry, in a
35 * fifo; each entry in the fifo corresponds to a particular journal
36 * entry/sequence number. When the refcount at the tail of the fifo goes to
37 * zero, we pop it off - thus, the size of the fifo tells us the number of open
38 * journal entries
39 *
40 * We take a refcount on a journal entry when we add some keys to a journal
41 * entry that we're going to insert (held by struct btree_op), and then when we
42 * insert those keys into the btree the btree write we're setting up takes a
43 * copy of that refcount (held by struct btree_write). That refcount is dropped
44 * when the btree write completes.
45 *
46 * A struct btree_write can only hold a refcount on a single journal entry, but
47 * might contain keys for many journal entries - we handle this by making sure
48 * it always has a refcount on the _oldest_ journal entry of all the journal
49 * entries it has keys for.
50 *
51 * JOURNAL RECLAIM:
52 *
53 * As mentioned previously, our fifo of refcounts tells us the number of open
54 * journal entries; from that and the current journal sequence number we compute
55 * last_seq - the oldest journal entry we still need. We write last_seq in each
56 * journal entry, and we also have to keep track of where it exists on disk so
57 * we don't overwrite it when we loop around the journal.
58 *
59 * To do that we track, for each journal bucket, the sequence number of the
60 * newest journal entry it contains - if we don't need that journal entry we
61 * don't need anything in that bucket anymore. From that we track the last
62 * journal bucket we still need; all this is tracked in struct journal_device
63 * and updated by journal_reclaim().
64 *
65 * JOURNAL FILLING UP:
66 *
67 * There are two ways the journal could fill up; either we could run out of
68 * space to write to, or we could have too many open journal entries and run out
69 * of room in the fifo of refcounts. Since those refcounts are decremented
70 * without any locking we can't safely resize that fifo, so we handle it the
71 * same way.
72 *
73 * If the journal fills up, we start flushing dirty btree nodes until we can
74 * allocate space for a journal write again - preferentially flushing btree
75 * nodes that are pinning the oldest journal entries first.
76 */
77
78#define BCACHE_JSET_VERSION_UUIDv1 1
79/* Always latest UUID format */
80#define BCACHE_JSET_VERSION_UUID 1
81#define BCACHE_JSET_VERSION 1
82
83/*
84 * On disk format for a journal entry:
85 * seq is monotonically increasing; every journal entry has its own unique
86 * sequence number.
87 *
88 * last_seq is the oldest journal entry that still has keys the btree hasn't
89 * flushed to disk yet.
90 *
91 * version is for on disk format changes.
92 */
93struct jset {
/* Set from csum_set() as the last header field before the entry is written */
94 uint64_t csum;
95 uint64_t magic;
96 uint64_t seq;
97 uint32_t version;
/* Amount of key data after the header, counted in 64-bit words */
98 uint32_t keys;
99
100 uint64_t last_seq;
101
102 BKEY_PADDED(uuid_bucket);
103 BKEY_PADDED(btree_root);
104 uint16_t btree_level;
105 uint16_t pad[3];
106
/* Indexed by the cache device's sb.nr_this_dev */
107 uint64_t prio_bucket[MAX_CACHES_PER_SET];
108
/* Start of the key data; two views of the same location */
109 union {
110 struct bkey start[0];
111 uint64_t d[0];
112 };
113};
114
115/*
116 * Only used for holding the journal entries we read in bch_journal_read()
117 * during cache registration
118 */
119struct journal_replay {
120 struct list_head list;
/* Refcount in journal.pin for this entry; NULL if it was not given one */
121 atomic_t *pin;
/* The journal entry itself */
122 struct jset j;
123};
124
125/*
126 * We put two of these in struct journal; we use them for writes to the
127 * journal that are being staged or in flight.
128 */
129struct journal_write {
/* Buffer holding the journal entry; allocated as 2^JSET_BITS pages */
130 struct jset *data;
131#define JSET_BITS 3
132
133 struct cache_set *c;
/* Closures waiting for this journal write to complete */
134 struct closure_waitlist wait;
/* Set when the entry has something to write; cleared by bch_journal_next() */
135 bool need_write;
136};
137
138/* Embedded in struct cache_set */
139struct journal {
140 spinlock_t lock;
141 /* used when waiting because the journal was full */
142 struct closure_waitlist wait;
/* Closure driving journal writes, with a timer for the write delay */
143 struct closure_with_timer io;
144
145 /* Number of blocks free in the bucket(s) we're currently writing to */
146 unsigned blocks_free;
/* Sequence number of the journal entry currently being filled */
147 uint64_t seq;
/* Refcounts for open journal entries; see OPEN JOURNAL ENTRIES above */
148 DECLARE_FIFO(atomic_t, pin);
149
/* Pointers to where the next journal write goes, one per device in use */
150 BKEY_PADDED(key);
151
/* The two write slots; cur points at the one currently being filled */
152 struct journal_write w[2], *cur;
153};
154
155/*
156 * Embedded in struct cache. First three fields refer to the array of journal
157 * buckets, in cache_sb.
158 */
159struct journal_device {
160 /*
161 * For each journal bucket, contains the max sequence number of the
162 * journal writes it contains - so we know when a bucket can be reused.
163 */
164 uint64_t seq[SB_JOURNAL_BUCKETS];
165
166 /* Journal bucket we're currently writing to */
167 unsigned cur_idx;
168
169 /* Last journal bucket that still contains an open journal entry */
170 unsigned last_idx;
171
172 /* Next journal bucket to be discarded */
173 unsigned discard_idx;
174
175#define DISCARD_READY 0
176#define DISCARD_IN_FLIGHT 1
177#define DISCARD_DONE 2
178 /* Discard state: one of DISCARD_READY, DISCARD_IN_FLIGHT, DISCARD_DONE */
179 atomic_t discard_in_flight;
180
/* Work item and bio used to issue the discard from a workqueue */
181 struct work_struct discard_work;
182 struct bio discard_bio;
183 struct bio_vec discard_bv;
184
185 /* Bio for journal reads/writes to this device */
186 struct bio bio;
187 struct bio_vec bv[8];
188};
189
/*
 * True if l's journal pin sits at a higher index in the pin fifo than r's.
 * Both l and r must have a ->journal pointer into (c)->journal.pin.
 */
190#define journal_pin_cmp(c, l, r) \
191 (fifo_idx(&(c)->journal.pin, (l)->journal) > \
192 fifo_idx(&(c)->journal.pin, (r)->journal))
193
/* Size passed to init_fifo() for the journal pin fifo */
194#define JOURNAL_PIN 20000
195
/*
 * The journal is full when there are no free blocks to write to or the pin
 * fifo has at most one free slot left.
 */
196#define journal_full(j) \
197 (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
198
199struct closure;
200struct cache_set;
201struct btree_op;
202
203void bch_journal(struct closure *);
204void bch_journal_next(struct journal *);
205void bch_journal_mark(struct cache_set *, struct list_head *);
206void bch_journal_meta(struct cache_set *, struct closure *);
207int bch_journal_read(struct cache_set *, struct list_head *,
208 struct btree_op *);
209int bch_journal_replay(struct cache_set *, struct list_head *,
210 struct btree_op *);
211
212void bch_journal_free(struct cache_set *);
213int bch_journal_alloc(struct cache_set *);
214
215#endif /* _BCACHE_JOURNAL_H */
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
new file mode 100644
index 000000000000..8589512c972e
--- /dev/null
+++ b/drivers/md/bcache/movinggc.c
@@ -0,0 +1,254 @@
1/*
2 * Moving/copying garbage collector
3 *
4 * Copyright 2012 Google, Inc.
5 */
6
7#include "bcache.h"
8#include "btree.h"
9#include "debug.h"
10#include "request.h"
11
/*
 * State for moving one key. Allocated with room for the bio's bio_vecs
 * directly after the struct, so bio must stay the last member.
 */
12struct moving_io {
/* Key from the moving_gc_keys keybuf that is being moved */
13 struct keybuf_key *w;
/* Embedded search; its closure s.cl drives the read and the rewrite */
14 struct search s;
/* Bio used for the read and then re-initialised for the write */
15 struct bbio bio;
16};
17
18static bool moving_pred(struct keybuf *buf, struct bkey *k)
19{
20 struct cache_set *c = container_of(buf, struct cache_set,
21 moving_gc_keys);
22 unsigned i;
23
24 for (i = 0; i < KEY_PTRS(k); i++) {
25 struct cache *ca = PTR_CACHE(c, k, i);
26 struct bucket *g = PTR_BUCKET(c, k, i);
27
28 if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
29 return true;
30 }
31
32 return false;
33}
34
35/* Moving GC - IO loop */
36
37static void moving_io_destructor(struct closure *cl)
38{
39 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
40 kfree(io);
41}
42
43static void write_moving_finish(struct closure *cl)
44{
45 struct moving_io *io = container_of(cl, struct moving_io, s.cl);
46 struct bio *bio = &io->bio.bio;
47 struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt);
48
49 while (bv-- != bio->bi_io_vec)
50 __free_page(bv->bv_page);
51
52 pr_debug("%s %s", io->s.op.insert_collision
53 ? "collision moving" : "moved",
54 pkey(&io->w->key));
55
56 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
57
58 atomic_dec_bug(&io->s.op.c->in_flight);
59 closure_wake_up(&io->s.op.c->moving_gc_wait);
60
61 closure_return_with_destructor(cl, moving_io_destructor);
62}
63
64static void read_moving_endio(struct bio *bio, int error)
65{
66 struct moving_io *io = container_of(bio->bi_private,
67 struct moving_io, s.cl);
68
69 if (error)
70 io->s.error = error;
71
72 bch_bbio_endio(io->s.op.c, bio, error, "reading data to move");
73}
74
75static void moving_init(struct moving_io *io)
76{
77 struct bio *bio = &io->bio.bio;
78
79 bio_init(bio);
80 bio_get(bio);
81 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
82
83 bio->bi_size = KEY_SIZE(&io->w->key) << 9;
84 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
85 PAGE_SECTORS);
86 bio->bi_private = &io->s.cl;
87 bio->bi_io_vec = bio->bi_inline_vecs;
88 bch_bio_map(bio, NULL);
89}
90
91static void write_moving(struct closure *cl)
92{
93 struct search *s = container_of(cl, struct search, cl);
94 struct moving_io *io = container_of(s, struct moving_io, s);
95
96 if (!s->error) {
97 trace_bcache_write_moving(&io->bio.bio);
98
99 moving_init(io);
100
101 io->bio.bio.bi_sector = KEY_START(&io->w->key);
102 s->op.lock = -1;
103 s->op.write_prio = 1;
104 s->op.cache_bio = &io->bio.bio;
105
106 s->writeback = KEY_DIRTY(&io->w->key);
107 s->op.csum = KEY_CSUM(&io->w->key);
108
109 s->op.type = BTREE_REPLACE;
110 bkey_copy(&s->op.replace, &io->w->key);
111
112 closure_init(&s->op.cl, cl);
113 bch_insert_data(&s->op.cl);
114 }
115
116 continue_at(cl, write_moving_finish, NULL);
117}
118
119static void read_moving_submit(struct closure *cl)
120{
121 struct search *s = container_of(cl, struct search, cl);
122 struct moving_io *io = container_of(s, struct moving_io, s);
123 struct bio *bio = &io->bio.bio;
124
125 trace_bcache_read_moving(bio);
126 bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
127
128 continue_at(cl, write_moving, bch_gc_wq);
129}
130
/*
 * Main moving GC loop: pull keys from the moving_gc_keys keybuf, allocate a
 * moving_io plus pages for each and start its read. At 64 or more moves in
 * flight it waits on moving_gc_wait and re-runs itself from bch_gc_wq. Stops
 * when the keybuf is exhausted, the cache set is stopping, or an allocation
 * fails.
 */
131static void read_moving(struct closure *cl)
132{
133 struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
134 struct keybuf_key *w;
135 struct moving_io *io;
136 struct bio *bio;
137
138 /* XXX: if we error, background writeback could stall indefinitely */
139
140 while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
141 w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
142 if (!w)
143 break;
144
/* moving_io is followed by one bio_vec per page of the key */
145 io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
146 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
147 GFP_KERNEL);
148 if (!io)
149 goto err;
150
151 w->private = io;
152 io->w = w;
153 io->s.op.inode = KEY_INODE(&w->key);
154 io->s.op.c = c;
155
156 moving_init(io);
157 bio = &io->bio.bio;
158
159 bio->bi_rw = READ;
160 bio->bi_end_io = read_moving_endio;
161
162 if (bch_bio_alloc_pages(bio, GFP_KERNEL))
163 goto err;
164
165 pr_debug("%s", pkey(&w->key));
166
167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
168
/* Throttle: wait for in_flight to drop below 64, then resume */
169 if (atomic_inc_return(&c->in_flight) >= 64) {
170 closure_wait_event(&c->moving_gc_wait, cl,
171 atomic_read(&c->in_flight) < 64);
172 continue_at(cl, read_moving, bch_gc_wq);
173 }
174 }
175
/* Error path only: entered through the err label, skipped otherwise */
176 if (0) {
177err: if (!IS_ERR_OR_NULL(w->private))
178 kfree(w->private);
179
180 bch_keybuf_del(&c->moving_gc_keys, w);
181 }
182
183 closure_return(cl);
184}
185
186static bool bucket_cmp(struct bucket *l, struct bucket *r)
187{
188 return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
189}
190
191static unsigned bucket_heap_top(struct cache *ca)
192{
193 return GC_SECTORS_USED(heap_peek(&ca->heap));
194}
195
/*
 * Moving garbage collection entry point. For every cache device it picks a
 * gc_move_threshold by collecting buckets into a heap and trimming it until
 * the total sectors to move fit within the reserve, then runs read_moving()
 * over the whole keyspace. Returns immediately if copy GC is disabled.
 */
196void bch_moving_gc(struct closure *cl)
197{
198 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
199 struct cache *ca;
200 struct bucket *b;
201 unsigned i;
202
203 if (!c->copy_gc_enabled)
204 closure_return(cl);
205
206 mutex_lock(&c->bucket_lock);
207
208 for_each_cache(ca, c, i) {
209 unsigned sectors_to_move = 0;
/* Budget: bucket size times min(free buckets in use, half the fifo size) */
210 unsigned reserve_sectors = ca->sb.bucket_size *
211 min(fifo_used(&ca->free), ca->free.size / 2);
212
213 ca->heap.used = 0;
214
215 for_each_bucket(b, ca) {
216 if (!GC_SECTORS_USED(b))
217 continue;
218
219 if (!heap_full(&ca->heap)) {
220 sectors_to_move += GC_SECTORS_USED(b);
221 heap_add(&ca->heap, b, bucket_cmp);
222 } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
/* Heap full: b has fewer sectors used than the top, so swap it in */
223 sectors_to_move -= bucket_heap_top(ca);
224 sectors_to_move += GC_SECTORS_USED(b);
225
226 ca->heap.data[0] = b;
227 heap_sift(&ca->heap, 0, bucket_cmp);
228 }
229 }
230
/* Drop buckets from the heap until what is left fits the budget */
231 while (sectors_to_move > reserve_sectors) {
232 heap_pop(&ca->heap, b, bucket_cmp);
233 sectors_to_move -= GC_SECTORS_USED(b);
234 }
235
/*
 * NOTE(review): if the loop above empties the heap, bucket_heap_top() is
 * evaluated on an empty heap - confirm heap_peek() handles that case.
 */
236 ca->gc_move_threshold = bucket_heap_top(ca);
237
238 pr_debug("threshold %u", ca->gc_move_threshold);
239 }
240
241 mutex_unlock(&c->bucket_lock);
242
/* Restart the keybuf scan from the beginning of the keyspace */
243 c->moving_gc_keys.last_scanned = ZERO_KEY;
244
245 closure_init(&c->moving_gc, cl);
246 read_moving(&c->moving_gc);
247
248 closure_return(cl);
249}
250
251void bch_moving_init_cache_set(struct cache_set *c)
252{
253 bch_keybuf_init(&c->moving_gc_keys, moving_pred);
254}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
new file mode 100644
index 000000000000..e5ff12e52d5b
--- /dev/null
+++ b/drivers/md/bcache/request.c
@@ -0,0 +1,1411 @@
1/*
2 * Main bcache entry point - handle a read or a write request and decide what to
3 * do with it; the make_request functions are called by the block layer.
4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc.
7 */
8
9#include "bcache.h"
10#include "btree.h"
11#include "debug.h"
12#include "request.h"
13
14#include <linux/cgroup.h>
15#include <linux/module.h>
16#include <linux/hash.h>
17#include <linux/random.h>
18#include "blk-cgroup.h"
19
20#include <trace/events/bcache.h>
21
/*
 * Thresholds compared against cache_set->gc_stats.in_use
 * (presumably a percentage of the cache in use -- TODO confirm units).
 */
/* check_should_skip() bypasses the cache once in_use exceeds this. */
22#define CUTOFF_CACHE_ADD 95
/* cached_dev_cache_miss() adds no readahead once in_use reaches this. */
23#define CUTOFF_CACHE_READA 90
/* should_writeback() allows writeback only while in_use is below this... */
24#define CUTOFF_WRITEBACK 50
/* ...or below this for bios flagged REQ_SYNC. */
25#define CUTOFF_WRITEBACK_SYNC 75
26
/* NOTE(review): presumably the slab cache backing struct search -- confirm against its users. */
27struct kmem_cache *bch_search_cache;
28
/* Forward declaration: request_read()/request_write() call this before its definition. */
29static void check_should_skip(struct cached_dev *, struct search *);
30
31/* Cgroup interface */
32
33#ifdef CONFIG_CGROUP_BCACHE
34static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
35
36static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
37{
38 struct cgroup_subsys_state *css;
39 return cgroup &&
40 (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
41 ? container_of(css, struct bch_cgroup, css)
42 : &bcache_default_cgroup;
43}
44
45struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
46{
47 struct cgroup_subsys_state *css = bio->bi_css
48 ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
49 : task_subsys_state(current, bcache_subsys_id);
50
51 return css
52 ? container_of(css, struct bch_cgroup, css)
53 : &bcache_default_cgroup;
54}
55
56static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
57 struct file *file,
58 char __user *buf, size_t nbytes, loff_t *ppos)
59{
60 char tmp[1024];
61 int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
62 cgroup_to_bcache(cgrp)->cache_mode + 1);
63
64 if (len < 0)
65 return len;
66
67 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
68}
69
70static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
71 const char *buf)
72{
73 int v = bch_read_string_list(buf, bch_cache_modes);
74 if (v < 0)
75 return v;
76
77 cgroup_to_bcache(cgrp)->cache_mode = v - 1;
78 return 0;
79}
80
81static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
82{
83 return cgroup_to_bcache(cgrp)->verify;
84}
85
86static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
87{
88 cgroup_to_bcache(cgrp)->verify = val;
89 return 0;
90}
91
92static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
93{
94 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
95 return atomic_read(&bcachecg->stats.cache_hits);
96}
97
98static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
99{
100 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
101 return atomic_read(&bcachecg->stats.cache_misses);
102}
103
104static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
105 struct cftype *cft)
106{
107 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
108 return atomic_read(&bcachecg->stats.cache_bypass_hits);
109}
110
111static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
112 struct cftype *cft)
113{
114 struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
115 return atomic_read(&bcachecg->stats.cache_bypass_misses);
116}
117
/*
 * Control files for the bcache cgroup handlers defined above.  cache_mode and
 * verify are readable and writable; the four statistics files only have a
 * read handler.  The array ends with an empty entry.
 * NOTE(review): nothing in this section references bch_files (bcache_subsys
 * does not point at it) -- confirm where it is registered.
 */
118static struct cftype bch_files[] = {
119 {
120 .name = "cache_mode",
121 .read = cache_mode_read,
122 .write_string = cache_mode_write,
123 },
124 {
125 .name = "verify",
126 .read_u64 = bch_verify_read,
127 .write_u64 = bch_verify_write,
128 },
129 {
130 .name = "cache_hits",
131 .read_u64 = bch_cache_hits_read,
132 },
133 {
134 .name = "cache_misses",
135 .read_u64 = bch_cache_misses_read,
136 },
137 {
138 .name = "cache_bypass_hits",
139 .read_u64 = bch_cache_bypass_hits_read,
140 },
141 {
142 .name = "cache_bypass_misses",
143 .read_u64 = bch_cache_bypass_misses_read,
144 },
145 { } /* terminate */
146};
147
/*
 * Give a freshly allocated cgroup state its defaults.  A cache_mode of -1
 * means "no per-cgroup override": cache_mode() only honours values >= 0 and
 * otherwise falls back to the device's own mode.
 */
148static void init_bch_cgroup(struct bch_cgroup *cg)
149{
150 cg->cache_mode = -1;
151}
152
153static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
154{
155 struct bch_cgroup *cg;
156
157 cg = kzalloc(sizeof(*cg), GFP_KERNEL);
158 if (!cg)
159 return ERR_PTR(-ENOMEM);
160 init_bch_cgroup(cg);
161 return &cg->css;
162}
163
164static void bcachecg_destroy(struct cgroup *cgroup)
165{
166 struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
167 free_css_id(&bcache_subsys, &cg->css);
168 kfree(cg);
169}
170
/*
 * The bcache cgroup subsystem descriptor: wires up the create/destroy hooks
 * defined above under the name "bcache".  Exported for GPL modules.
 */
171struct cgroup_subsys bcache_subsys = {
172 .create = bcachecg_create,
173 .destroy = bcachecg_destroy,
174 .subsys_id = bcache_subsys_id,
175 .name = "bcache",
176 .module = THIS_MODULE,
177};
178EXPORT_SYMBOL_GPL(bcache_subsys);
179#endif
180
181static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
182{
183#ifdef CONFIG_CGROUP_BCACHE
184 int r = bch_bio_to_cgroup(bio)->cache_mode;
185 if (r >= 0)
186 return r;
187#endif
188 return BDEV_CACHE_MODE(&dc->sb);
189}
190
191static bool verify(struct cached_dev *dc, struct bio *bio)
192{
193#ifdef CONFIG_CGROUP_BCACHE
194 if (bch_bio_to_cgroup(bio)->verify)
195 return true;
196#endif
197 return dc->verify;
198}
199
200static void bio_csum(struct bio *bio, struct bkey *k)
201{
202 struct bio_vec *bv;
203 uint64_t csum = 0;
204 int i;
205
206 bio_for_each_segment(bv, bio, i) {
207 void *d = kmap(bv->bv_page) + bv->bv_offset;
208 csum = bch_crc64_update(csum, d, bv->bv_len);
209 kunmap(bv->bv_page);
210 }
211
212 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
213}
214
215/* Insert data into cache */
216
217static void bio_invalidate(struct closure *cl)
218{
219 struct btree_op *op = container_of(cl, struct btree_op, cl);
220 struct bio *bio = op->cache_bio;
221
222 pr_debug("invalidating %i sectors from %llu",
223 bio_sectors(bio), (uint64_t) bio->bi_sector);
224
225 while (bio_sectors(bio)) {
226 unsigned len = min(bio_sectors(bio), 1U << 14);
227
228 if (bch_keylist_realloc(&op->keys, 0, op->c))
229 goto out;
230
231 bio->bi_sector += len;
232 bio->bi_size -= len << 9;
233
234 bch_keylist_add(&op->keys,
235 &KEY(op->inode, bio->bi_sector, len));
236 }
237
238 op->insert_data_done = true;
239 bio_put(bio);
240out:
241 continue_at(cl, bch_journal, bcache_wq);
242}
243
/* One bucket currently open for data writes; lives on cache_set->data_buckets. */
244struct open_bucket {
 /* Linkage on c->data_buckets; bch_alloc_sectors() moves the entry to the tail on use. */
245 struct list_head list;
 /* Task that last allocated from this bucket (compared in pick_data_bucket()). */
246 struct task_struct *last;
 /* Sectors still available; 0 means a new bucket must be supplied. */
247 unsigned sectors_free;
 /* Pointers to the open bucket(s), plus the key position of the last write made to it. */
248 BKEY_PADDED(key);
249};
250
251void bch_open_buckets_free(struct cache_set *c)
252{
253 struct open_bucket *b;
254
255 while (!list_empty(&c->data_buckets)) {
256 b = list_first_entry(&c->data_buckets,
257 struct open_bucket, list);
258 list_del(&b->list);
259 kfree(b);
260 }
261}
262
263int bch_open_buckets_alloc(struct cache_set *c)
264{
265 int i;
266
267 spin_lock_init(&c->data_bucket_lock);
268
269 for (i = 0; i < 6; i++) {
270 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
271 if (!b)
272 return -ENOMEM;
273
274 list_add(&b->list, &c->data_buckets);
275 }
276
277 return 0;
278}
279
280/*
281 * We keep multiple buckets open for writes, and try to segregate different
282 * write streams for better cache utilization: first we look for a bucket where
283 * the last write to it was sequential with the current write, and failing that
284 * we look for a bucket that was last used by the same task.
285 *
286 * The idea is if you've got multiple tasks pulling data into the cache at the
287 * same time, you'll get better cache utilization if you try to segregate their
288 * data and preserve locality.
289 *
290 * For example, say you're starting Firefox at the same time you're copying a
291 * bunch of files. Firefox will likely end up being fairly hot and stay in the
292 * cache awhile, but the data you copied might not be; if you wrote all that
293 * data to the same buckets it'd get invalidated at the same time.
294 *
295 * Both of those tasks will be doing fairly random IO so we can't rely on
296 * detecting sequential IO to segregate their data, but going off of the task
297 * should be a sane heuristic.
298 */
299static struct open_bucket *pick_data_bucket(struct cache_set *c,
300 const struct bkey *search,
301 struct task_struct *task,
302 struct bkey *alloc)
303{
304 struct open_bucket *ret, *ret_task = NULL;
305
306 list_for_each_entry_reverse(ret, &c->data_buckets, list)
307 if (!bkey_cmp(&ret->key, search))
308 goto found;
309 else if (ret->last == task)
310 ret_task = ret;
311
312 ret = ret_task ?: list_first_entry(&c->data_buckets,
313 struct open_bucket, list);
314found:
315 if (!ret->sectors_free && KEY_PTRS(alloc)) {
316 ret->sectors_free = c->sb.bucket_size;
317 bkey_copy(&ret->key, alloc);
318 bkey_init(alloc);
319 }
320
321 if (!ret->sectors_free)
322 ret = NULL;
323
324 return ret;
325}
326
327/*
328 * Allocates some space in the cache to write to, and k to point to the newly
329 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
330 * end of the newly allocated space).
331 *
332 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
333 * sectors were actually allocated.
334 *
335 * If s->writeback is true, will not fail.
 *
 * @k:       key to fill in; its offset on entry is the start of the write
 * @sectors: number of sectors wanted
 * @s:       search the write belongs to (supplies the cache set, task,
 *           write_prio and writeback flag)
 *
 * Returns true on success, false if bch_bucket_alloc_set() failed.
336 */
337static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
338 struct search *s)
339{
340 struct cache_set *c = s->op.c;
341 struct open_bucket *b;
342 BKEY_PADDED(key) alloc;
343 struct closure cl, *w = NULL;
344 unsigned i;
345
 /* Only writeback writes hand bch_bucket_alloc_set() a closure (w stays NULL otherwise). */
346 if (s->writeback) {
347 closure_init_stack(&cl);
348 w = &cl;
349 }
350
351 /*
352 * We might have to allocate a new bucket, which we can't do with a
353 * spinlock held. So if we have to allocate, we drop the lock, allocate
354 * and then retry. KEY_PTRS() indicates whether alloc points to
355 * allocated bucket(s).
356 */
357
358 bkey_init(&alloc.key);
359 spin_lock(&c->data_bucket_lock);
360
361 while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
362 unsigned watermark = s->op.write_prio
363 ? WATERMARK_MOVINGGC
364 : WATERMARK_NONE;
365
366 spin_unlock(&c->data_bucket_lock);
367
368 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
369 return false;
370
371 spin_lock(&c->data_bucket_lock);
372 }
373
374 /*
375 * If we had to allocate, we might race and not need to allocate the
376 * second time we call pick_data_bucket(). If we allocated a bucket but
377 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
378 */
379 if (KEY_PTRS(&alloc.key))
380 __bkey_put(c, &alloc.key);
381
382 for (i = 0; i < KEY_PTRS(&b->key); i++)
383 EBUG_ON(ptr_stale(c, &b->key, i));
384
385 /* Set up the pointer to the space we're allocating: */
386
387 for (i = 0; i < KEY_PTRS(&b->key); i++)
388 k->ptr[i] = b->key.ptr[i];
389
 /* Never hand out more than the bucket has left. */
390 sectors = min(sectors, b->sectors_free);
391
392 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
393 SET_KEY_SIZE(k, sectors);
394 SET_KEY_PTRS(k, KEY_PTRS(&b->key));
395
396 /*
397 * Move b to the end of the lru, and keep track of what this bucket was
398 * last used for:
399 */
400 list_move_tail(&b->list, &c->data_buckets);
401 bkey_copy_key(&b->key, k);
402 b->last = s->task;
403
404 b->sectors_free -= sectors;
405
 /* Advance the open bucket's pointers past the space just handed out. */
406 for (i = 0; i < KEY_PTRS(&b->key); i++) {
407 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
408
409 atomic_long_add(sectors,
410 &PTR_CACHE(c, &b->key, i)->sectors_written);
411 }
412
 /* Less than one block left is unusable: treat the bucket as full. */
413 if (b->sectors_free < c->sb.block_size)
414 b->sectors_free = 0;
415
416 /*
417 * k takes refcounts on the buckets it points to until it's inserted
418 * into the btree, but if we're done with this bucket we just transfer
419 * the open bucket's own refcount.
420 */
421 if (b->sectors_free)
422 for (i = 0; i < KEY_PTRS(&b->key); i++)
423 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
424
425 spin_unlock(&c->data_bucket_lock);
426 return true;
427}
428
429static void bch_insert_data_error(struct closure *cl)
430{
431 struct btree_op *op = container_of(cl, struct btree_op, cl);
432
433 /*
434 * Our data write just errored, which means we've got a bunch of keys to
435 * insert that point to data that wasn't succesfully written.
436 *
437 * We don't have to insert those keys but we still have to invalidate
438 * that region of the cache - so, if we just strip off all the pointers
439 * from the keys we'll accomplish just that.
440 */
441
442 struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;
443
444 while (src != op->keys.top) {
445 struct bkey *n = bkey_next(src);
446
447 SET_KEY_PTRS(src, 0);
448 bkey_copy(dst, src);
449
450 dst = bkey_next(dst);
451 src = n;
452 }
453
454 op->keys.top = dst;
455
456 bch_journal(cl);
457}
458
459static void bch_insert_data_endio(struct bio *bio, int error)
460{
461 struct closure *cl = bio->bi_private;
462 struct btree_op *op = container_of(cl, struct btree_op, cl);
463 struct search *s = container_of(op, struct search, op);
464
465 if (error) {
466 /* TODO: We could try to recover from this. */
467 if (s->writeback)
468 s->error = error;
469 else if (s->write)
470 set_closure_fn(cl, bch_insert_data_error, bcache_wq);
471 else
472 set_closure_fn(cl, NULL, NULL);
473 }
474
475 bch_bbio_endio(op->c, bio, error, "writing data to cache");
476}
477
/*
 * bch_insert_data_loop - write op->cache_bio into the cache piece by piece
 * @cl: closure embedded in a btree_op that is itself embedded in a search
 *
 * If op->skip is set the range is invalidated instead (bio_invalidate()).
 * Otherwise, for each piece: make room for a key, allocate cache space with
 * bch_alloc_sectors(), split that many sectors off the bio, push the key and
 * submit the piece.  When the last piece has been submitted
 * insert_data_done is set and the closure continues at bch_journal.
 *
 * continue_at() and closure_return() leave this function, so the statements
 * following them only run when their condition was false.
 */
478static void bch_insert_data_loop(struct closure *cl)
479{
480 struct btree_op *op = container_of(cl, struct btree_op, cl);
481 struct search *s = container_of(op, struct search, op);
482 struct bio *bio = op->cache_bio, *n;
483
484 if (op->skip)
485 return bio_invalidate(cl);
486
 /* Charge these sectors against the gc budget; once it goes negative, reset it and queue gc. */
487 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
488 set_gc_sectors(op->c);
489 bch_queue_gc(op->c);
490 }
491
492 do {
493 unsigned i;
494 struct bkey *k;
495 struct bio_set *split = s->d
496 ? s->d->bio_split : op->c->bio_split;
497
498 /* 1 for the device pointer and 1 for the chksum */
 /* No room for another key: go to bch_journal with the keys collected so far (insert_data_done stays unset). */
499 if (bch_keylist_realloc(&op->keys,
500 1 + (op->csum ? 1 : 0),
501 op->c))
502 continue_at(cl, bch_journal, bcache_wq);
503
504 k = op->keys.top;
505 bkey_init(k);
506 SET_KEY_INODE(k, op->inode);
507 SET_KEY_OFFSET(k, bio->bi_sector);
508
509 if (!bch_alloc_sectors(k, bio_sectors(bio), s))
510 goto err;
511
 /* Split off exactly what was allocated; on failure release k's references and retry from the workqueue. */
512 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
513 if (!n) {
514 __bkey_put(op->c, k);
515 continue_at(cl, bch_insert_data_loop, bcache_wq);
516 }
517
518 n->bi_end_io = bch_insert_data_endio;
519 n->bi_private = cl;
520
 /* Writeback data is dirty: flag the key and mark the buckets it points to. */
521 if (s->writeback) {
522 SET_KEY_DIRTY(k, true);
523
524 for (i = 0; i < KEY_PTRS(k); i++)
525 SET_GC_MARK(PTR_BUCKET(op->c, k, i),
526 GC_MARK_DIRTY);
527 }
528
529 SET_KEY_CSUM(k, op->csum);
530 if (KEY_CSUM(k))
531 bio_csum(n, k);
532
533 pr_debug("%s", pkey(k));
534 bch_keylist_push(&op->keys);
535
536 trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
537 n->bi_rw |= REQ_WRITE;
538 bch_submit_bbio(n, op->c, k, 0);
 /* The loop ends once the split returned the bio itself. */
539 } while (n != bio);
540
541 op->insert_data_done = true;
542 continue_at(cl, bch_journal, bcache_wq);
543err:
544 /* bch_alloc_sectors() blocks if s->writeback = true */
545 BUG_ON(s->writeback);
546
547 /*
548 * But if it's not a writeback write we'd rather just bail out if
549 * there aren't any buckets ready to write to - it might take awhile and
550 * we might be starving btree writes for gc or something.
551 */
552
553 if (s->write) {
554 /*
555 * Writethrough write: We can't complete the write until we've
556 * updated the index. But we don't want to delay the write while
557 * we wait for buckets to be freed up, so just invalidate the
558 * rest of the write.
559 */
560 op->skip = true;
561 return bio_invalidate(cl);
562 } else {
563 /*
564 * From a cache miss, we can just insert the keys for the data
565 * we have written or bail out if we didn't do anything.
566 */
567 op->insert_data_done = true;
568 bio_put(bio);
569
570 if (!bch_keylist_empty(&op->keys))
571 continue_at(cl, bch_journal, bcache_wq);
572 else
573 closure_return(cl);
574 }
575}
576
577/**
578 * bch_insert_data - stick some data in the cache
579 *
580 * This is the starting point for any data to end up in a cache device; it could
581 * be from a normal write, or a writeback write, or a write to a flash only
582 * volume - it's also used by the moving garbage collector to compact data in
583 * mostly empty buckets.
584 *
585 * It first writes the data to the cache, creating a list of keys to be inserted
586 * (if the data had to be fragmented there will be multiple keys); after the
587 * data is written it calls bch_journal, and after the keys have been added to
588 * the next journal write they're inserted into the btree.
589 *
590 * It inserts the data in op->cache_bio; bi_sector is used for the key offset,
591 * and op->inode is used for the key inode.
592 *
593 * If op->skip is true, instead of inserting the data it invalidates the region
594 * of the cache represented by op->cache_bio and op->inode.
595 */
596void bch_insert_data(struct closure *cl)
597{
598 struct btree_op *op = container_of(cl, struct btree_op, cl);
599
600 bch_keylist_init(&op->keys);
601 bio_get(op->cache_bio);
602 bch_insert_data_loop(cl);
603}
604
605void bch_btree_insert_async(struct closure *cl)
606{
607 struct btree_op *op = container_of(cl, struct btree_op, cl);
608 struct search *s = container_of(op, struct search, op);
609
610 if (bch_btree_insert(op, op->c)) {
611 s->error = -ENOMEM;
612 op->insert_data_done = true;
613 }
614
615 if (op->insert_data_done) {
616 bch_keylist_free(&op->keys);
617 closure_return(cl);
618 } else
619 continue_at(cl, bch_insert_data_loop, bcache_wq);
620}
621
622/* Common code for the make_request functions */
623
624static void request_endio(struct bio *bio, int error)
625{
626 struct closure *cl = bio->bi_private;
627
628 if (error) {
629 struct search *s = container_of(cl, struct search, cl);
630 s->error = error;
631 /* Only cache read errors are recoverable */
632 s->recoverable = false;
633 }
634
635 bio_put(bio);
636 closure_put(cl);
637}
638
639void bch_cache_read_endio(struct bio *bio, int error)
640{
641 struct bbio *b = container_of(bio, struct bbio, bio);
642 struct closure *cl = bio->bi_private;
643 struct search *s = container_of(cl, struct search, cl);
644
645 /*
646 * If the bucket was reused while our bio was in flight, we might have
647 * read the wrong data. Set s->error but not error so it doesn't get
648 * counted against the cache device, but we'll still reread the data
649 * from the backing device.
650 */
651
652 if (error)
653 s->error = error;
654 else if (ptr_stale(s->op.c, &b->key, 0)) {
655 atomic_long_inc(&s->op.c->cache_read_races);
656 s->error = -EINTR;
657 }
658
659 bch_bbio_endio(s->op.c, bio, error, "reading from cache");
660}
661
662static void bio_complete(struct search *s)
663{
664 if (s->orig_bio) {
665 int cpu, rw = bio_data_dir(s->orig_bio);
666 unsigned long duration = jiffies - s->start_time;
667
668 cpu = part_stat_lock();
669 part_round_stats(cpu, &s->d->disk->part0);
670 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
671 part_stat_unlock();
672
673 trace_bcache_request_end(s, s->orig_bio);
674 bio_endio(s->orig_bio, s->error);
675 s->orig_bio = NULL;
676 }
677}
678
679static void do_bio_hook(struct search *s)
680{
681 struct bio *bio = &s->bio.bio;
682 memcpy(bio, s->orig_bio, sizeof(struct bio));
683
684 bio->bi_end_io = request_endio;
685 bio->bi_private = &s->cl;
686 atomic_set(&bio->bi_cnt, 3);
687}
688
689static void search_free(struct closure *cl)
690{
691 struct search *s = container_of(cl, struct search, cl);
692 bio_complete(s);
693
694 if (s->op.cache_bio)
695 bio_put(s->op.cache_bio);
696
697 if (s->unaligned_bvec)
698 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
699
700 closure_debug_destroy(cl);
701 mempool_free(s, s->d->c->search);
702}
703
704static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
705{
706 struct bio_vec *bv;
707 struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
708 memset(s, 0, offsetof(struct search, op.keys));
709
710 __closure_init(&s->cl, NULL);
711
712 s->op.inode = d->id;
713 s->op.c = d->c;
714 s->d = d;
715 s->op.lock = -1;
716 s->task = current;
717 s->orig_bio = bio;
718 s->write = (bio->bi_rw & REQ_WRITE) != 0;
719 s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0;
720 s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
721 s->recoverable = 1;
722 s->start_time = jiffies;
723 do_bio_hook(s);
724
725 if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
726 bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
727 memcpy(bv, bio_iovec(bio),
728 sizeof(struct bio_vec) * bio_segments(bio));
729
730 s->bio.bio.bi_io_vec = bv;
731 s->unaligned_bvec = 1;
732 }
733
734 return s;
735}
736
737static void btree_read_async(struct closure *cl)
738{
739 struct btree_op *op = container_of(cl, struct btree_op, cl);
740
741 int ret = btree_root(search_recurse, op->c, op);
742
743 if (ret == -EAGAIN)
744 continue_at(cl, btree_read_async, bcache_wq);
745
746 closure_return(cl);
747}
748
749/* Cached devices */
750
751static void cached_dev_bio_complete(struct closure *cl)
752{
753 struct search *s = container_of(cl, struct search, cl);
754 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
755
756 search_free(cl);
757 cached_dev_put(dc);
758}
759
760/* Process reads */
761
762static void cached_dev_read_complete(struct closure *cl)
763{
764 struct search *s = container_of(cl, struct search, cl);
765
766 if (s->op.insert_collision)
767 bch_mark_cache_miss_collision(s);
768
769 if (s->op.cache_bio) {
770 int i;
771 struct bio_vec *bv;
772
773 __bio_for_each_segment(bv, s->op.cache_bio, i, 0)
774 __free_page(bv->bv_page);
775 }
776
777 cached_dev_bio_complete(cl);
778}
779
780static void request_read_error(struct closure *cl)
781{
782 struct search *s = container_of(cl, struct search, cl);
783 struct bio_vec *bv;
784 int i;
785
786 if (s->recoverable) {
787 /* The cache read failed, but we can retry from the backing
788 * device.
789 */
790 pr_debug("recovering at sector %llu",
791 (uint64_t) s->orig_bio->bi_sector);
792
793 s->error = 0;
794 bv = s->bio.bio.bi_io_vec;
795 do_bio_hook(s);
796 s->bio.bio.bi_io_vec = bv;
797
798 if (!s->unaligned_bvec)
799 bio_for_each_segment(bv, s->orig_bio, i)
800 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
801 else
802 memcpy(s->bio.bio.bi_io_vec,
803 bio_iovec(s->orig_bio),
804 sizeof(struct bio_vec) *
805 bio_segments(s->orig_bio));
806
807 /* XXX: invalidate cache */
808
809 trace_bcache_read_retry(&s->bio.bio);
810 closure_bio_submit(&s->bio.bio, &s->cl, s->d);
811 }
812
813 continue_at(cl, cached_dev_read_complete, NULL);
814}
815
/*
 * request_read_done - finish a successful read
 * @cl: the search's closure
 *
 * After a cache miss (op.cache_bio set) the data read into cache_bio's pages
 * is copied into the pages of the cache_miss bio, and the reference on
 * cache_miss is dropped.  The read is then verified if requested and still
 * recoverable, the original bio is completed, and - unless the cache set is
 * stopping - the miss data is handed to bch_insert_data() with
 * op.type = BTREE_REPLACE.  The closure continues at
 * cached_dev_read_complete().
 */
816static void request_read_done(struct closure *cl)
817{
818 struct search *s = container_of(cl, struct search, cl);
819 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
820
821 /*
822 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now
823 * contains data ready to be inserted into the cache.
824 *
825 * First, we copy the data we just read from cache_bio's bounce buffers
826 * to the buffers the original bio pointed to:
827 */
828
829 if (s->op.cache_bio) {
830 struct bio_vec *src, *dst;
831 unsigned src_offset, dst_offset, bytes;
832 void *dst_ptr;
833
 /* Rebuild cache_bio so it again describes the whole range that was read. */
834 bio_reset(s->op.cache_bio);
835 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
836 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
837 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
838 bch_bio_map(s->op.cache_bio, NULL);
839
840 src = bio_iovec(s->op.cache_bio);
841 dst = bio_iovec(s->cache_miss);
842 src_offset = src->bv_offset;
843 dst_offset = dst->bv_offset;
844 dst_ptr = kmap(dst->bv_page);
845
 /* Walk both bvec arrays in step; the loop ends when the destination runs out. */
846 while (1) {
847 if (dst_offset == dst->bv_offset + dst->bv_len) {
848 kunmap(dst->bv_page);
849 dst++;
850 if (dst == bio_iovec_idx(s->cache_miss,
851 s->cache_miss->bi_vcnt))
852 break;
853
854 dst_offset = dst->bv_offset;
855 dst_ptr = kmap(dst->bv_page);
856 }
857
 /* The source running out before the destination is treated as a bug. */
858 if (src_offset == src->bv_offset + src->bv_len) {
859 src++;
860 if (src == bio_iovec_idx(s->op.cache_bio,
861 s->op.cache_bio->bi_vcnt))
862 BUG();
863
864 src_offset = src->bv_offset;
865 }
866
 /* Copy as much as fits in both the current source and destination segments. */
867 bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
868 src->bv_offset + src->bv_len - src_offset);
869
870 memcpy(dst_ptr + dst_offset,
871 page_address(src->bv_page) + src_offset,
872 bytes);
873
874 src_offset += bytes;
875 dst_offset += bytes;
876 }
877
878 bio_put(s->cache_miss);
879 s->cache_miss = NULL;
880 }
881
882 if (verify(dc, &s->bio.bio) && s->recoverable)
883 bch_data_verify(s);
884
 /* The caller gets its completion before the data is inserted into the cache. */
885 bio_complete(s);
886
887 if (s->op.cache_bio &&
888 !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
889 s->op.type = BTREE_REPLACE;
890 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
891 }
892
893 continue_at(cl, cached_dev_read_complete, NULL);
894}
895
/*
 * First stage of read completion (installed by request_read()).  Records the
 * hit/miss accounting, then picks the next stage:
 *  - request_read_error() on bcache_wq if the search carries an error;
 *  - request_read_done() on bcache_wq if there is a cache bio to process or
 *    verification is requested;
 *  - cached_dev_read_complete() otherwise.
 */
896static void request_read_done_bh(struct closure *cl)
897{
898 struct search *s = container_of(cl, struct search, cl);
899 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
900
 /* Counted as a hit when no cache_miss bio was recorded; op.skip is passed as the bypass flag. */
901 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
902
903 if (s->error)
904 continue_at_nobarrier(cl, request_read_error, bcache_wq);
905 else if (s->op.cache_bio || verify(dc, &s->bio.bio))
906 continue_at_nobarrier(cl, request_read_done, bcache_wq);
907 else
908 continue_at_nobarrier(cl, cached_dev_read_complete, NULL);
909}
910
/*
 * cached_dev_cache_miss - handle a range of a read that is not in the cache
 * @b:       btree node the lookup is at
 * @s:       the search
 * @bio:     remaining part of the request
 * @sectors: size of the missing range
 *
 * Splits @sectors off @bio.  In the simple cases (a miss was already recorded
 * for this search, the request bypasses the cache, or no cache bio could be
 * set up) the split bio is just submitted to the backing device.  Otherwise a
 * separate cache_bio covering the miss plus optional readahead is allocated,
 * a placeholder is inserted via bch_btree_insert_check_key(), pages are
 * attached and cache_bio is submitted instead; the split bio is remembered in
 * s->cache_miss for request_read_done().
 *
 * Returns -EAGAIN if the split failed, -EINTR once the check key insert has
 * been attempted (the caller's btree iterator is stale), 0 otherwise.
 */
911static int cached_dev_cache_miss(struct btree *b, struct search *s,
912 struct bio *bio, unsigned sectors)
913{
914 int ret = 0;
915 unsigned reada;
916 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
917 struct bio *miss;
918
919 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
920 if (!miss)
921 return -EAGAIN;
922
 /* The split returned the bio itself: nothing is left to look up. */
923 if (miss == bio)
924 s->op.lookup_done = true;
925
926 miss->bi_end_io = request_endio;
927 miss->bi_private = &s->cl;
928
929 if (s->cache_miss || s->op.skip)
930 goto out_submit;
931
 /* Readahead only for the final piece, never for readahead/metadata requests or a nearly full cache. */
932 if (miss != bio ||
933 (bio->bi_rw & REQ_RAHEAD) ||
934 (bio->bi_rw & REQ_META) ||
935 s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
936 reada = 0;
937 else {
938 reada = min(dc->readahead >> 9,
939 sectors - bio_sectors(miss));
940
 /* Do not read past the end of the backing device. */
941 if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev))
942 reada = bdev_sectors(miss->bi_bdev) - bio_end(miss);
943 }
944
945 s->cache_bio_sectors = bio_sectors(miss) + reada;
946 s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
947 DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
948 dc->disk.bio_split);
949
950 if (!s->op.cache_bio)
951 goto out_submit;
952
953 s->op.cache_bio->bi_sector = miss->bi_sector;
954 s->op.cache_bio->bi_bdev = miss->bi_bdev;
955 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
956
957 s->op.cache_bio->bi_end_io = request_endio;
958 s->op.cache_bio->bi_private = &s->cl;
959
960 /* btree_search_recurse()'s btree iterator is no good anymore */
961 ret = -EINTR;
962 if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
963 goto out_put;
964
965 bch_bio_map(s->op.cache_bio, NULL);
966 if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
967 goto out_put;
968
 /* From here on the data is read through cache_bio; miss is kept for the copy in request_read_done(). */
969 s->cache_miss = miss;
970 bio_get(s->op.cache_bio);
971
972 trace_bcache_cache_miss(s->orig_bio);
973 closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
974
975 return ret;
976out_put:
977 bio_put(s->op.cache_bio);
978 s->op.cache_bio = NULL;
979out_submit:
 /* Fallback: read the missing range straight from the backing device. */
980 closure_bio_submit(miss, &s->cl, s->d);
981 return ret;
982}
983
/*
 * Start a read: decide whether the request should bypass the cache, launch
 * the btree lookup (btree_read_async) on the op's closure with the search's
 * closure passed as the last argument, and continue at
 * request_read_done_bh().
 */
984static void request_read(struct cached_dev *dc, struct search *s)
985{
986 struct closure *cl = &s->cl;
987
988 check_should_skip(dc, s);
989 closure_call(&s->op.cl, btree_read_async, NULL, cl);
990
991 continue_at(cl, request_read_done_bh, NULL);
992}
993
994/* Process writes */
995
996static void cached_dev_write_complete(struct closure *cl)
997{
998 struct search *s = container_of(cl, struct search, cl);
999 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
1000
1001 up_read_non_owner(&dc->writeback_lock);
1002 cached_dev_bio_complete(cl);
1003}
1004
1005static bool should_writeback(struct cached_dev *dc, struct bio *bio)
1006{
1007 unsigned threshold = (bio->bi_rw & REQ_SYNC)
1008 ? CUTOFF_WRITEBACK_SYNC
1009 : CUTOFF_WRITEBACK;
1010
1011 return !atomic_read(&dc->disk.detaching) &&
1012 cache_mode(dc, bio) == CACHE_MODE_WRITEBACK &&
1013 dc->disk.c->gc_stats.in_use < threshold;
1014}
1015
/*
 * request_write - handle a write (or discard) to a cached device
 * @dc: the cached device
 * @s:  the search set up for the request
 *
 * Three outcomes:
 *  - skip: the data goes to the backing device only (unless it is a discard
 *    the backing queue does not support) and the covered range is invalidated
 *    in the cache (op.skip makes bch_insert_data() invalidate);
 *  - writethrough: the bio is submitted to the backing device and a clone is
 *    inserted into the cache;
 *  - writeback: the bio itself is inserted into the cache and the sectors are
 *    reported to bch_writeback_add().
 * Every path ends at "out", which starts bch_insert_data() and continues at
 * cached_dev_write_complete().
 */
1016static void request_write(struct cached_dev *dc, struct search *s)
1017{
1018 struct closure *cl = &s->cl;
1019 struct bio *bio = &s->bio.bio;
1020 struct bkey start, end;
1021 start = KEY(dc->disk.id, bio->bi_sector, 0);
1022 end = KEY(dc->disk.id, bio_end(bio), 0);
1023
 /* Result ignored here. NOTE(review): presumably called for its effect on overlapping moving-GC keys -- confirm. */
1024 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
1025
1026 check_should_skip(dc, s);
 /* Held for read until cached_dev_write_complete() releases it. */
1027 down_read_non_owner(&dc->writeback_lock);
1028
 /* Overlap with keys queued for writeback forces this write into writeback mode. */
1029 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
1030 s->op.skip = false;
1031 s->writeback = true;
1032 }
1033
1034 if (bio->bi_rw & REQ_DISCARD)
1035 goto skip;
1036
1037 if (s->op.skip)
1038 goto skip;
1039
1040 if (should_writeback(dc, s->orig_bio))
1041 s->writeback = true;
1042
1043 if (!s->writeback) {
1044 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1045 dc->disk.bio_split);
1046
1047 trace_bcache_writethrough(s->orig_bio);
1048 closure_bio_submit(bio, cl, s->d);
1049 } else {
1050 s->op.cache_bio = bio;
1051 trace_bcache_writeback(s->orig_bio);
1052 bch_writeback_add(dc, bio_sectors(bio));
1053 }
1054out:
1055 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1056 continue_at(cl, cached_dev_write_complete, NULL);
1057skip:
1058 s->op.skip = true;
1059 s->op.cache_bio = s->orig_bio;
1060 bio_get(s->op.cache_bio);
1061 trace_bcache_write_skip(s->orig_bio);
1062
 /* A discard the backing queue cannot handle is not passed down; the cache range is still invalidated. */
1063 if ((bio->bi_rw & REQ_DISCARD) &&
1064 !blk_queue_discard(bdev_get_queue(dc->bdev)))
1065 goto out;
1066
1067 closure_bio_submit(bio, cl, s->d);
1068 goto out;
1069}
1070
1071static void request_nodata(struct cached_dev *dc, struct search *s)
1072{
1073 struct closure *cl = &s->cl;
1074 struct bio *bio = &s->bio.bio;
1075
1076 if (bio->bi_rw & REQ_DISCARD) {
1077 request_write(dc, s);
1078 return;
1079 }
1080
1081 if (s->op.flush_journal)
1082 bch_journal_meta(s->op.c, cl);
1083
1084 closure_bio_submit(bio, cl, s->d);
1085
1086 continue_at(cl, cached_dev_bio_complete, NULL);
1087}
1088
1089/* Cached devices - read & write stuff */
1090
1091int bch_get_congested(struct cache_set *c)
1092{
1093 int i;
1094
1095 if (!c->congested_read_threshold_us &&
1096 !c->congested_write_threshold_us)
1097 return 0;
1098
1099 i = (local_clock_us() - c->congested_last_us) / 1024;
1100 if (i < 0)
1101 return 0;
1102
1103 i += atomic_read(&c->congested);
1104 if (i >= 0)
1105 return 0;
1106
1107 i += CONGESTED_MAX;
1108
1109 return i <= 0 ? 1 : fract_exp_two(i, 6);
1110}
1111
/*
 * Fold the task's current sequential_io figure into its running average
 * (ewma_add() with arguments 8 and 0) and restart the current figure at zero.
 */
1112static void add_sequential(struct task_struct *t)
1113{
1114 ewma_add(t->sequential_io_avg,
1115 t->sequential_io, 8, 0);
1116
1117 t->sequential_io = 0;
1118}
1119
1120static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
1121{
1122 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
1123}
1124
/*
 * check_should_skip - decide whether a request should bypass the cache
 * @dc: the cached device
 * @s:  the search for the request
 *
 * Ends in one of two ways: "skip" marks the sectors as bypassed and sets
 * s->op.skip; "rescale" leaves op.skip alone and calls
 * bch_rescale_priorities().
 *
 * A request is skipped outright when the device is detaching, the cache is
 * above CUTOFF_CACHE_ADD, it is a discard, the cache mode rules it out, or it
 * is not aligned to the cache's block size.  Otherwise the amount of
 * sequential I/O seen recently (per tracked stream when sequential_merge is
 * set, per task otherwise) is compared against a cutoff that comes either
 * from congestion (bch_get_congested()) or from dc->sequential_cutoff.
 */
1125static void check_should_skip(struct cached_dev *dc, struct search *s)
1126{
1127 struct cache_set *c = s->op.c;
1128 struct bio *bio = &s->bio.bio;
1129
1130 long rand;
1131 int cutoff = bch_get_congested(c);
1132 unsigned mode = cache_mode(dc, bio);
1133
1134 if (atomic_read(&dc->disk.detaching) ||
1135 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
1136 (bio->bi_rw & REQ_DISCARD))
1137 goto skip;
1138
1139 if (mode == CACHE_MODE_NONE ||
1140 (mode == CACHE_MODE_WRITEAROUND &&
1141 (bio->bi_rw & REQ_WRITE)))
1142 goto skip;
1143
1144 if (bio->bi_sector & (c->sb.block_size - 1) ||
1145 bio_sectors(bio) & (c->sb.block_size - 1)) {
1146 pr_debug("skipping unaligned io");
1147 goto skip;
1148 }
1149
 /* Not congested: fall back to the sequential cutoff (shifted down by 9, presumably bytes to sectors). */
1150 if (!cutoff) {
1151 cutoff = dc->sequential_cutoff >> 9;
1152
 /* A zero cutoff disables sequential bypass. */
1153 if (!cutoff)
1154 goto rescale;
1155
 /* Synchronous writes in writeback mode are never bypassed for being sequential. */
1156 if (mode == CACHE_MODE_WRITEBACK &&
1157 (bio->bi_rw & REQ_WRITE) &&
1158 (bio->bi_rw & REQ_SYNC))
1159 goto rescale;
1160 }
1161
1162 if (dc->sequential_merge) {
1163 struct io *i;
1164
1165 spin_lock(&dc->io_lock);
1166
 /* Look for a tracked stream that ended exactly where this bio starts and has not expired. */
1167 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
1168 if (i->last == bio->bi_sector &&
1169 time_before(jiffies, i->jiffies))
1170 goto found;
1171
 /* None found: reuse the first entry of the lru for a new stream. */
1172 i = list_first_entry(&dc->io_lru, struct io, lru);
1173
1174 add_sequential(s->task);
1175 i->sequential = 0;
1176found:
 /* Accumulate unless the addition would wrap. */
1177 if (i->sequential + bio->bi_size > i->sequential)
1178 i->sequential += bio->bi_size;
1179
 /* Record where the stream now ends and let it expire 5000 ms from now. */
1180 i->last = bio_end(bio);
1181 i->jiffies = jiffies + msecs_to_jiffies(5000);
1182 s->task->sequential_io = i->sequential;
1183
 /* Rehash under the new end sector and move to the tail of the lru. */
1184 hlist_del(&i->hash);
1185 hlist_add_head(&i->hash, iohash(dc, i->last));
1186 list_move_tail(&i->lru, &dc->io_lru);
1187
1188 spin_unlock(&dc->io_lock);
1189 } else {
1190 s->task->sequential_io = bio->bi_size;
1191
1192 add_sequential(s->task);
1193 }
1194
 /* Lower the cutoff by the number of set bits in a random value, so the decision is randomised. */
1195 rand = get_random_int();
1196 cutoff -= bitmap_weight(&rand, BITS_PER_LONG);
1197
1198 if (cutoff <= (int) (max(s->task->sequential_io,
1199 s->task->sequential_io_avg) >> 9))
1200 goto skip;
1201
1202rescale:
1203 bch_rescale_priorities(c, bio_sectors(bio));
1204 return;
1205skip:
1206 bch_mark_sectors_bypassed(s, bio_sectors(bio));
1207 s->op.skip = true;
1208}
1209
1210static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1211{
1212 struct search *s;
1213 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1214 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1215 int cpu, rw = bio_data_dir(bio);
1216
1217 cpu = part_stat_lock();
1218 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1219 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1220 part_stat_unlock();
1221
1222 bio->bi_bdev = dc->bdev;
1223 bio->bi_sector += dc->sb.data_offset;
1224
1225 if (cached_dev_get(dc)) {
1226 s = search_alloc(bio, d);
1227 trace_bcache_request_start(s, bio);
1228
1229 if (!bio_has_data(bio))
1230 request_nodata(dc, s);
1231 else if (rw)
1232 request_write(dc, s);
1233 else
1234 request_read(dc, s);
1235 } else {
1236 if ((bio->bi_rw & REQ_DISCARD) &&
1237 !blk_queue_discard(bdev_get_queue(dc->bdev)))
1238 bio_endio(bio, 0);
1239 else
1240 bch_generic_make_request(bio, &d->bio_split_hook);
1241 }
1242}
1243
1244static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
1245 unsigned int cmd, unsigned long arg)
1246{
1247 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1248 return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
1249}
1250
1251static int cached_dev_congested(void *data, int bits)
1252{
1253 struct bcache_device *d = data;
1254 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1255 struct request_queue *q = bdev_get_queue(dc->bdev);
1256 int ret = 0;
1257
1258 if (bdi_congested(&q->backing_dev_info, bits))
1259 return 1;
1260
1261 if (cached_dev_get(dc)) {
1262 unsigned i;
1263 struct cache *ca;
1264
1265 for_each_cache(ca, d->c, i) {
1266 q = bdev_get_queue(ca->bdev);
1267 ret |= bdi_congested(&q->backing_dev_info, bits);
1268 }
1269
1270 cached_dev_put(dc);
1271 }
1272
1273 return ret;
1274}
1275
1276void bch_cached_dev_request_init(struct cached_dev *dc)
1277{
1278 struct gendisk *g = dc->disk.disk;
1279
1280 g->queue->make_request_fn = cached_dev_make_request;
1281 g->queue->backing_dev_info.congested_fn = cached_dev_congested;
1282 dc->disk.cache_miss = cached_dev_cache_miss;
1283 dc->disk.ioctl = cached_dev_ioctl;
1284}
1285
1286/* Flash backed devices */
1287
1288static int flash_dev_cache_miss(struct btree *b, struct search *s,
1289 struct bio *bio, unsigned sectors)
1290{
1291 /* Zero fill bio */
1292
1293 while (bio->bi_idx != bio->bi_vcnt) {
1294 struct bio_vec *bv = bio_iovec(bio);
1295 unsigned j = min(bv->bv_len >> 9, sectors);
1296
1297 void *p = kmap(bv->bv_page);
1298 memset(p + bv->bv_offset, 0, j << 9);
1299 kunmap(bv->bv_page);
1300
1301 bv->bv_len -= j << 9;
1302 bv->bv_offset += j << 9;
1303
1304 if (bv->bv_len)
1305 return 0;
1306
1307 bio->bi_sector += j;
1308 bio->bi_size -= j << 9;
1309
1310 bio->bi_idx++;
1311 sectors -= j;
1312 }
1313
1314 s->op.lookup_done = true;
1315
1316 return 0;
1317}
1318
1319static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1320{
1321 struct search *s;
1322 struct closure *cl;
1323 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1324 int cpu, rw = bio_data_dir(bio);
1325
1326 cpu = part_stat_lock();
1327 part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1328 part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1329 part_stat_unlock();
1330
1331 s = search_alloc(bio, d);
1332 cl = &s->cl;
1333 bio = &s->bio.bio;
1334
1335 trace_bcache_request_start(s, bio);
1336
1337 if (bio_has_data(bio) && !rw) {
1338 closure_call(&s->op.cl, btree_read_async, NULL, cl);
1339 } else if (bio_has_data(bio) || s->op.skip) {
1340 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
1341 &KEY(d->id, bio->bi_sector, 0),
1342 &KEY(d->id, bio_end(bio), 0));
1343
1344 s->writeback = true;
1345 s->op.cache_bio = bio;
1346
1347 closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1348 } else {
1349 /* No data - probably a cache flush */
1350 if (s->op.flush_journal)
1351 bch_journal_meta(s->op.c, cl);
1352 }
1353
1354 continue_at(cl, search_free, NULL);
1355}
1356
1357static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
1358 unsigned int cmd, unsigned long arg)
1359{
1360 return -ENOTTY;
1361}
1362
1363static int flash_dev_congested(void *data, int bits)
1364{
1365 struct bcache_device *d = data;
1366 struct request_queue *q;
1367 struct cache *ca;
1368 unsigned i;
1369 int ret = 0;
1370
1371 for_each_cache(ca, d->c, i) {
1372 q = bdev_get_queue(ca->bdev);
1373 ret |= bdi_congested(&q->backing_dev_info, bits);
1374 }
1375
1376 return ret;
1377}
1378
1379void bch_flash_dev_request_init(struct bcache_device *d)
1380{
1381 struct gendisk *g = d->disk;
1382
1383 g->queue->make_request_fn = flash_dev_make_request;
1384 g->queue->backing_dev_info.congested_fn = flash_dev_congested;
1385 d->cache_miss = flash_dev_cache_miss;
1386 d->ioctl = flash_dev_ioctl;
1387}
1388
1389void bch_request_exit(void)
1390{
1391#ifdef CONFIG_CGROUP_BCACHE
1392 cgroup_unload_subsys(&bcache_subsys);
1393#endif
1394 if (bch_search_cache)
1395 kmem_cache_destroy(bch_search_cache);
1396}
1397
1398int __init bch_request_init(void)
1399{
1400 bch_search_cache = KMEM_CACHE(search, 0);
1401 if (!bch_search_cache)
1402 return -ENOMEM;
1403
1404#ifdef CONFIG_CGROUP_BCACHE
1405 cgroup_load_subsys(&bcache_subsys);
1406 init_bch_cgroup(&bcache_default_cgroup);
1407
1408 cgroup_add_cftypes(&bcache_subsys, bch_files);
1409#endif
1410 return 0;
1411}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
new file mode 100644
index 000000000000..254d9ab5707c
--- /dev/null
+++ b/drivers/md/bcache/request.h
@@ -0,0 +1,62 @@
1#ifndef _BCACHE_REQUEST_H_
2#define _BCACHE_REQUEST_H_
3
4#include <linux/cgroup.h>
5
/*
 * Per-request state for a bio travelling through bcache.  Objects of this
 * type are allocated from bch_search_cache, which bch_request_init() creates
 * with KMEM_CACHE(search, 0).
 */
6struct search {
7 /* Stack frame for bio_complete */
8 struct closure cl;
9
10 /* Device the request was issued against */
10 struct bcache_device *d;
11 /* Task whose sequential_io / sequential_io_avg counters get updated */
11 struct task_struct *task;
12
13 /* The bio the request path actually works on (see &s->bio.bio users) */
13 struct bbio bio;
14 /* Used e.g. for the cgroup lookup in bch_mark_cache_accounting() */
14 struct bio *orig_bio;
15 struct bio *cache_miss;
16 unsigned cache_bio_sectors;
17
18 unsigned recoverable:1;
19 unsigned unaligned_bvec:1;
20
21 unsigned write:1;
22 unsigned writeback:1;
23
24 /* IO error returned to s->bio */
25 short error;
26 unsigned long start_time;
27
28 /* Anything past op->keys won't get zeroed in do_bio_hook */
29 struct btree_op op;
30};
31
/* Request path entry points shared with the rest of bcache */
int bch_get_congested(struct cache_set *);
void bch_insert_data(struct closure *cl);
void bch_btree_insert_async(struct closure *);
/* Declared once; the original header repeated this prototype */
void bch_cache_read_endio(struct bio *, int);

void bch_open_buckets_free(struct cache_set *);
int bch_open_buckets_alloc(struct cache_set *);

/* Install make_request / congested / cache_miss / ioctl callbacks */
void bch_cached_dev_request_init(struct cached_dev *dc);
void bch_flash_dev_request_init(struct bcache_device *d);

extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
45
/*
 * Per-cgroup bcache settings and statistics.  The css member only exists
 * when CONFIG_CGROUP_BCACHE is enabled.
 */
46struct bch_cgroup {
47#ifdef CONFIG_CGROUP_BCACHE
48 struct cgroup_subsys_state css;
49#endif
50 /*
51 * We subtract one from the index into bch_cache_modes[], so that
52 * default == -1; this makes it so the rest match up with d->cache_mode,
53 * and we use d->cache_mode if cgrp->cache_mode < 0
54 */
55 short cache_mode;
56 bool verify;
57 /* Hit/miss counters bumped by bch_mark_cache_accounting() */
57 struct cache_stat_collector stats;
58};
59
60struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio);
61
62#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
new file mode 100644
index 000000000000..64e679449c2a
--- /dev/null
+++ b/drivers/md/bcache/stats.c
@@ -0,0 +1,246 @@
1/*
2 * bcache stats code
3 *
4 * Copyright 2012 Google, Inc.
5 */
6
7#include "bcache.h"
8#include "stats.h"
9#include "btree.h"
10#include "request.h"
11#include "sysfs.h"
12
13/*
14 * We keep absolute totals of various statistics, and additionally a set of three
15 * rolling averages.
16 *
17 * Every so often, a timer goes off and rescales the rolling averages.
18 * accounting_rescale[] is how many times the timer has to go off before we
19 * rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
20 * and one day.
21 *
22 * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
23 * and accounting_weight is what we use to rescale:
24 *
25 * pow(31 / 32, 22) ~= 1/2
26 *
27 * So that we don't have to increment each set of numbers every time we (say)
28 * get a cache hit, we increment a single atomic_t in acc->collector, and when
29 * the rescale function runs it resets the atomic counter to 0 and adds its
30 * old value to each of the exported numbers.
31 *
32 * To reduce rounding error, the numbers in struct cache_stats are all
33 * stored left shifted by 16, and scaled back in the sysfs show() function.
34 */
35
/*
 * Number of timer firings between rescales of each rolling average; passed
 * as rescale_at to scale_stats() from scale_accounting().
 */
36static const unsigned DAY_RESCALE = 288;
37static const unsigned HOUR_RESCALE = 12;
38static const unsigned FIVE_MINUTE_RESCALE = 1;
/* Timer period in jiffies: 22 firings per 300 seconds */
39static const unsigned accounting_delay = (HZ * 300) / 22;
/* Weight handed to ewma_add() when a statistic is decayed in scale_stat() */
40static const unsigned accounting_weight = 32;
41
42/* sysfs reading/writing */
43
/*
 * One read-only sysfs attribute per exported statistic.  Each is referenced
 * as sysfs_<name> in bch_stats_files[] (presumably that is the object the
 * read_attribute() macro defines - see sysfs.h).
 */
44read_attribute(cache_hits);
45read_attribute(cache_misses);
46read_attribute(cache_bypass_hits);
47read_attribute(cache_bypass_misses);
48read_attribute(cache_hit_ratio);
49read_attribute(cache_readaheads);
50read_attribute(cache_miss_collisions);
51read_attribute(bypassed);
52
/*
 * sysfs show handler for a struct cache_stats kobject.
 *
 * NOTE(review): kobj (and the attribute being read) are presumably supplied
 * by the SHOW() macro, and var_print()/sysfs_print()/sysfs_hprint() are
 * presumably what emit the value for the matching attribute - confirm in
 * sysfs.h.
 */
53SHOW(bch_stats)
54{
55 struct cache_stats *s =
56 container_of(kobj, struct cache_stats, kobj);
/* Counters are stored left shifted by 16 (see scale_accounting()) */
57#define var(stat) (s->stat >> 16)
58 var_print(cache_hits);
59 var_print(cache_misses);
60 var_print(cache_bypass_hits);
61 var_print(cache_bypass_misses);
62
/* Hit ratio as a percentage of hits + misses; DIV_SAFE guards the divide */
63 sysfs_print(cache_hit_ratio,
64 DIV_SAFE(var(cache_hits) * 100,
65 var(cache_hits) + var(cache_misses)));
66
67 var_print(cache_readaheads);
68 var_print(cache_miss_collisions);
/* sectors_bypassed is kept in sectors; << 9 converts to bytes */
69 sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
70#undef var
71 return 0;
72}
73
/*
 * sysfs store handler for the stats kobjects.  Nothing is writable here:
 * the handler stores nothing and simply returns size.
 */
74STORE(bch_stats)
75{
76 return size;
77}
78
/*
 * Release callback for the stats kobjects; does nothing.  The kobjects are
 * members of struct cache_stats, which is itself embedded in
 * struct cache_accounting, so there is no separate allocation to free here.
 */
79static void bch_stats_release(struct kobject *k)
80{
81}
82
/* NULL-terminated list of the attributes exposed by each stats kobject */
83static struct attribute *bch_stats_files[] = {
84 &sysfs_cache_hits,
85 &sysfs_cache_misses,
86 &sysfs_cache_bypass_hits,
87 &sysfs_cache_bypass_misses,
88 &sysfs_cache_hit_ratio,
89 &sysfs_cache_readaheads,
90 &sysfs_cache_miss_collisions,
91 &sysfs_bypassed,
92 NULL
93};
/*
 * Defines bch_stats_ktype, used by bch_cache_accounting_init().
 * NOTE(review): presumably ties together the show/store/release functions
 * and bch_stats_files[] above - confirm against the KTYPE() macro.
 */
94static KTYPE(bch_stats);
95
96static void scale_accounting(unsigned long data);
97
98void bch_cache_accounting_init(struct cache_accounting *acc,
99 struct closure *parent)
100{
101 kobject_init(&acc->total.kobj, &bch_stats_ktype);
102 kobject_init(&acc->five_minute.kobj, &bch_stats_ktype);
103 kobject_init(&acc->hour.kobj, &bch_stats_ktype);
104 kobject_init(&acc->day.kobj, &bch_stats_ktype);
105
106 closure_init(&acc->cl, parent);
107 init_timer(&acc->timer);
108 acc->timer.expires = jiffies + accounting_delay;
109 acc->timer.data = (unsigned long) acc;
110 acc->timer.function = scale_accounting;
111 add_timer(&acc->timer);
112}
113
114int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
115 struct kobject *parent)
116{
117 int ret = kobject_add(&acc->total.kobj, parent,
118 "stats_total");
119 ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
120 "stats_five_minute");
121 ret = ret ?: kobject_add(&acc->hour.kobj, parent,
122 "stats_hour");
123 ret = ret ?: kobject_add(&acc->day.kobj, parent,
124 "stats_day");
125 return ret;
126}
127
128void bch_cache_accounting_clear(struct cache_accounting *acc)
129{
130 memset(&acc->total.cache_hits,
131 0,
132 sizeof(unsigned long) * 7);
133}
134
135void bch_cache_accounting_destroy(struct cache_accounting *acc)
136{
137 kobject_put(&acc->total.kobj);
138 kobject_put(&acc->five_minute.kobj);
139 kobject_put(&acc->hour.kobj);
140 kobject_put(&acc->day.kobj);
141
142 atomic_set(&acc->closing, 1);
143 if (del_timer_sync(&acc->timer))
144 closure_return(&acc->cl);
145}
146
147/* EWMA scaling */
148
149static void scale_stat(unsigned long *stat)
150{
151 *stat = ewma_add(*stat, 0, accounting_weight, 0);
152}
153
154static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
155{
156 if (++stats->rescale == rescale_at) {
157 stats->rescale = 0;
158 scale_stat(&stats->cache_hits);
159 scale_stat(&stats->cache_misses);
160 scale_stat(&stats->cache_bypass_hits);
161 scale_stat(&stats->cache_bypass_misses);
162 scale_stat(&stats->cache_readaheads);
163 scale_stat(&stats->cache_miss_collisions);
164 scale_stat(&stats->sectors_bypassed);
165 }
166}
167
168static void scale_accounting(unsigned long data)
169{
170 struct cache_accounting *acc = (struct cache_accounting *) data;
171
172#define move_stat(name) do { \
173 unsigned t = atomic_xchg(&acc->collector.name, 0); \
174 t <<= 16; \
175 acc->five_minute.name += t; \
176 acc->hour.name += t; \
177 acc->day.name += t; \
178 acc->total.name += t; \
179} while (0)
180
181 move_stat(cache_hits);
182 move_stat(cache_misses);
183 move_stat(cache_bypass_hits);
184 move_stat(cache_bypass_misses);
185 move_stat(cache_readaheads);
186 move_stat(cache_miss_collisions);
187 move_stat(sectors_bypassed);
188
189 scale_stats(&acc->total, 0);
190 scale_stats(&acc->day, DAY_RESCALE);
191 scale_stats(&acc->hour, HOUR_RESCALE);
192 scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
193
194 acc->timer.expires += accounting_delay;
195
196 if (!atomic_read(&acc->closing))
197 add_timer(&acc->timer);
198 else
199 closure_return(&acc->cl);
200}
201
202static void mark_cache_stats(struct cache_stat_collector *stats,
203 bool hit, bool bypass)
204{
205 if (!bypass)
206 if (hit)
207 atomic_inc(&stats->cache_hits);
208 else
209 atomic_inc(&stats->cache_misses);
210 else
211 if (hit)
212 atomic_inc(&stats->cache_bypass_hits);
213 else
214 atomic_inc(&stats->cache_bypass_misses);
215}
216
217void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass)
218{
219 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
220 mark_cache_stats(&dc->accounting.collector, hit, bypass);
221 mark_cache_stats(&s->op.c->accounting.collector, hit, bypass);
222#ifdef CONFIG_CGROUP_BCACHE
223 mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
224#endif
225}
226
227void bch_mark_cache_readahead(struct search *s)
228{
229 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
230 atomic_inc(&dc->accounting.collector.cache_readaheads);
231 atomic_inc(&s->op.c->accounting.collector.cache_readaheads);
232}
233
234void bch_mark_cache_miss_collision(struct search *s)
235{
236 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
237 atomic_inc(&dc->accounting.collector.cache_miss_collisions);
238 atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions);
239}
240
241void bch_mark_sectors_bypassed(struct search *s, int sectors)
242{
243 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
244 atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
245 atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed);
246}
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
new file mode 100644
index 000000000000..c7c7a8fd29fe
--- /dev/null
+++ b/drivers/md/bcache/stats.h
@@ -0,0 +1,58 @@
1#ifndef _BCACHE_STATS_H_
2#define _BCACHE_STATS_H_
3
/*
 * Raw event counters, bumped atomically from the request path by the
 * bch_mark_*() functions and drained back to zero by the accounting timer.
 */
4struct cache_stat_collector {
5 atomic_t cache_hits;
6 atomic_t cache_misses;
7 atomic_t cache_bypass_hits;
8 atomic_t cache_bypass_misses;
9 atomic_t cache_readaheads;
10 atomic_t cache_miss_collisions;
11 atomic_t sectors_bypassed;
12};
13
/*
 * One exported set of statistics, with its own sysfs kobject.  The counter
 * values are stored left shifted by 16; the sysfs show function shifts them
 * back down.
 */
14struct cache_stats {
15 struct kobject kobj;
16
17 unsigned long cache_hits;
18 unsigned long cache_misses;
19 unsigned long cache_bypass_hits;
20 unsigned long cache_bypass_misses;
21 unsigned long cache_readaheads;
22 unsigned long cache_miss_collisions;
23 unsigned long sectors_bypassed;
24
/* Timer firings since this set was last decayed (see scale_stats()) */
25 unsigned rescale;
26};
27
/*
 * All accounting state for one object: the raw collector, the periodic timer
 * that drains it, and the four exported stats sets.
 */
28struct cache_accounting {
29 struct closure cl;
30 struct timer_list timer;
/* Set by bch_cache_accounting_destroy(); stops the timer re-arming */
31 atomic_t closing;
32
33 struct cache_stat_collector collector;
34
35 struct cache_stats total;
36 struct cache_stats five_minute;
37 struct cache_stats hour;
38 struct cache_stats day;
39};
40
41struct search;
42
43void bch_cache_accounting_init(struct cache_accounting *acc,
44 struct closure *parent);
45
46int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
47 struct kobject *parent);
48
49void bch_cache_accounting_clear(struct cache_accounting *acc);
50
51void bch_cache_accounting_destroy(struct cache_accounting *acc);
52
53void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass);
54void bch_mark_cache_readahead(struct search *s);
55void bch_mark_cache_miss_collision(struct search *s);
56void bch_mark_sectors_bypassed(struct search *s, int sectors);
57
58#endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
new file mode 100644
index 000000000000..c8046bc4aa57
--- /dev/null
+++ b/drivers/md/bcache/super.c
@@ -0,0 +1,1987 @@
1/*
2 * bcache setup/teardown code, and some metadata io - read a superblock and
3 * figure out what to do with it.
4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc.
7 */
8
9#include "bcache.h"
10#include "btree.h"
11#include "debug.h"
12#include "request.h"
13
14#include <linux/buffer_head.h>
15#include <linux/debugfs.h>
16#include <linux/genhd.h>
17#include <linux/module.h>
18#include <linux/random.h>
19#include <linux/reboot.h>
20#include <linux/sysfs.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
24
/*
 * 16-byte magic identifying a bcache superblock; read_super() rejects any
 * superblock whose magic field differs.
 */
25static const char bcache_magic[] = {
26 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
27 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
28};
29
/* 16-byte UUID constant; NOTE(review): its users are outside this section */
30static const char invalid_uuid[] = {
31 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
32 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
33};
34
/*
 * NULL-terminated table of cache mode names.  Entry 0 ("default") is only
 * meaningful for cgroups, whose cache_mode is stored as index - 1.
 */
35/* Default is -1; we skip past it for struct cached_dev's cache mode */
36const char * const bch_cache_modes[] = {
37 "default",
38 "writethrough",
39 "writeback",
40 "writearound",
41 "none",
42 NULL
43};
44
/*
 * Old on-disk UUID entry layout.  uuid_read() converts an array of these in
 * place to struct uuid_entry when the journal set version is older than
 * BCACHE_JSET_VERSION_UUIDv1.
 */
45struct uuid_entry_v0 {
46 uint8_t uuid[16];
47 uint8_t label[32];
48 uint32_t first_reg;
49 uint32_t last_reg;
50 uint32_t invalidated;
51 uint32_t pad;
52};
53
54static struct kobject *bcache_kobj;
/* __uuid_write() asserts that this lock is held */
55struct mutex bch_register_lock;
56LIST_HEAD(bch_cache_sets);
57static LIST_HEAD(uncached_devices);
58
59static int bcache_major, bcache_minor;
60static wait_queue_head_t unregister_wait;
61struct workqueue_struct *bcache_wq;
62
/* Number of pages that make up 256 KiB */
63#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
64
65static void bio_split_pool_free(struct bio_split_pool *p)
66{
67 if (p->bio_split_hook)
68 mempool_destroy(p->bio_split_hook);
69
70 if (p->bio_split)
71 bioset_free(p->bio_split);
72}
73
74static int bio_split_pool_init(struct bio_split_pool *p)
75{
76 p->bio_split = bioset_create(4, 0);
77 if (!p->bio_split)
78 return -ENOMEM;
79
80 p->bio_split_hook = mempool_create_kmalloc_pool(4,
81 sizeof(struct bio_split_hook));
82 if (!p->bio_split_hook)
83 return -ENOMEM;
84
85 return 0;
86}
87
88/* Superblock */
89
90static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
91 struct page **res)
92{
93 const char *err;
94 struct cache_sb *s;
95 struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
96 unsigned i;
97
98 if (!bh)
99 return "IO error";
100
101 s = (struct cache_sb *) bh->b_data;
102
103 sb->offset = le64_to_cpu(s->offset);
104 sb->version = le64_to_cpu(s->version);
105
106 memcpy(sb->magic, s->magic, 16);
107 memcpy(sb->uuid, s->uuid, 16);
108 memcpy(sb->set_uuid, s->set_uuid, 16);
109 memcpy(sb->label, s->label, SB_LABEL_SIZE);
110
111 sb->flags = le64_to_cpu(s->flags);
112 sb->seq = le64_to_cpu(s->seq);
113 sb->last_mount = le32_to_cpu(s->last_mount);
114 sb->first_bucket = le16_to_cpu(s->first_bucket);
115 sb->keys = le16_to_cpu(s->keys);
116
117 for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
118 sb->d[i] = le64_to_cpu(s->d[i]);
119
120 pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
121 sb->version, sb->flags, sb->seq, sb->keys);
122
123 err = "Not a bcache superblock";
124 if (sb->offset != SB_SECTOR)
125 goto err;
126
127 if (memcmp(sb->magic, bcache_magic, 16))
128 goto err;
129
130 err = "Too many journal buckets";
131 if (sb->keys > SB_JOURNAL_BUCKETS)
132 goto err;
133
134 err = "Bad checksum";
135 if (s->csum != csum_set(s))
136 goto err;
137
138 err = "Bad UUID";
139 if (bch_is_zero(sb->uuid, 16))
140 goto err;
141
142 sb->block_size = le16_to_cpu(s->block_size);
143
144 err = "Superblock block size smaller than device block size";
145 if (sb->block_size << 9 < bdev_logical_block_size(bdev))
146 goto err;
147
148 switch (sb->version) {
149 case BCACHE_SB_VERSION_BDEV:
150 sb->data_offset = BDEV_DATA_START_DEFAULT;
151 break;
152 case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
153 sb->data_offset = le64_to_cpu(s->data_offset);
154
155 err = "Bad data offset";
156 if (sb->data_offset < BDEV_DATA_START_DEFAULT)
157 goto err;
158
159 break;
160 case BCACHE_SB_VERSION_CDEV:
161 case BCACHE_SB_VERSION_CDEV_WITH_UUID:
162 sb->nbuckets = le64_to_cpu(s->nbuckets);
163 sb->block_size = le16_to_cpu(s->block_size);
164 sb->bucket_size = le16_to_cpu(s->bucket_size);
165
166 sb->nr_in_set = le16_to_cpu(s->nr_in_set);
167 sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
168
169 err = "Too many buckets";
170 if (sb->nbuckets > LONG_MAX)
171 goto err;
172
173 err = "Not enough buckets";
174 if (sb->nbuckets < 1 << 7)
175 goto err;
176
177 err = "Bad block/bucket size";
178 if (!is_power_of_2(sb->block_size) ||
179 sb->block_size > PAGE_SECTORS ||
180 !is_power_of_2(sb->bucket_size) ||
181 sb->bucket_size < PAGE_SECTORS)
182 goto err;
183
184 err = "Invalid superblock: device too small";
185 if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
186 goto err;
187
188 err = "Bad UUID";
189 if (bch_is_zero(sb->set_uuid, 16))
190 goto err;
191
192 err = "Bad cache device number in set";
193 if (!sb->nr_in_set ||
194 sb->nr_in_set <= sb->nr_this_dev ||
195 sb->nr_in_set > MAX_CACHES_PER_SET)
196 goto err;
197
198 err = "Journal buckets not sequential";
199 for (i = 0; i < sb->keys; i++)
200 if (sb->d[i] != sb->first_bucket + i)
201 goto err;
202
203 err = "Too many journal buckets";
204 if (sb->first_bucket + sb->keys > sb->nbuckets)
205 goto err;
206
207 err = "Invalid superblock: first bucket comes before end of super";
208 if (sb->first_bucket * sb->bucket_size < 16)
209 goto err;
210
211 break;
212 default:
213 err = "Unsupported superblock version";
214 goto err;
215 }
216
217 sb->last_mount = get_seconds();
218 err = NULL;
219
220 get_page(bh->b_page);
221 *res = bh->b_page;
222err:
223 put_bh(bh);
224 return err;
225}
226
227static void write_bdev_super_endio(struct bio *bio, int error)
228{
229 struct cached_dev *dc = bio->bi_private;
230 /* XXX: error checking */
231
232 closure_put(&dc->sb_write.cl);
233}
234
235static void __write_super(struct cache_sb *sb, struct bio *bio)
236{
237 struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
238 unsigned i;
239
240 bio->bi_sector = SB_SECTOR;
241 bio->bi_rw = REQ_SYNC|REQ_META;
242 bio->bi_size = SB_SIZE;
243 bch_bio_map(bio, NULL);
244
245 out->offset = cpu_to_le64(sb->offset);
246 out->version = cpu_to_le64(sb->version);
247
248 memcpy(out->uuid, sb->uuid, 16);
249 memcpy(out->set_uuid, sb->set_uuid, 16);
250 memcpy(out->label, sb->label, SB_LABEL_SIZE);
251
252 out->flags = cpu_to_le64(sb->flags);
253 out->seq = cpu_to_le64(sb->seq);
254
255 out->last_mount = cpu_to_le32(sb->last_mount);
256 out->first_bucket = cpu_to_le16(sb->first_bucket);
257 out->keys = cpu_to_le16(sb->keys);
258
259 for (i = 0; i < sb->keys; i++)
260 out->d[i] = cpu_to_le64(sb->d[i]);
261
262 out->csum = csum_set(out);
263
264 pr_debug("ver %llu, flags %llu, seq %llu",
265 sb->version, sb->flags, sb->seq);
266
267 submit_bio(REQ_WRITE, bio);
268}
269
270void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
271{
272 struct closure *cl = &dc->sb_write.cl;
273 struct bio *bio = &dc->sb_bio;
274
275 closure_lock(&dc->sb_write, parent);
276
277 bio_reset(bio);
278 bio->bi_bdev = dc->bdev;
279 bio->bi_end_io = write_bdev_super_endio;
280 bio->bi_private = dc;
281
282 closure_get(cl);
283 __write_super(&dc->sb, bio);
284
285 closure_return(cl);
286}
287
288static void write_super_endio(struct bio *bio, int error)
289{
290 struct cache *ca = bio->bi_private;
291
292 bch_count_io_errors(ca, error, "writing superblock");
293 closure_put(&ca->set->sb_write.cl);
294}
295
296void bcache_write_super(struct cache_set *c)
297{
298 struct closure *cl = &c->sb_write.cl;
299 struct cache *ca;
300 unsigned i;
301
302 closure_lock(&c->sb_write, &c->cl);
303
304 c->sb.seq++;
305
306 for_each_cache(ca, c, i) {
307 struct bio *bio = &ca->sb_bio;
308
309 ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
310 ca->sb.seq = c->sb.seq;
311 ca->sb.last_mount = c->sb.last_mount;
312
313 SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
314
315 bio_reset(bio);
316 bio->bi_bdev = ca->bdev;
317 bio->bi_end_io = write_super_endio;
318 bio->bi_private = ca;
319
320 closure_get(cl);
321 __write_super(&ca->sb, bio);
322 }
323
324 closure_return(cl);
325}
326
327/* UUID io */
328
329static void uuid_endio(struct bio *bio, int error)
330{
331 struct closure *cl = bio->bi_private;
332 struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);
333
334 cache_set_err_on(error, c, "accessing uuids");
335 bch_bbio_free(bio, c);
336 closure_put(cl);
337}
338
339static void uuid_io(struct cache_set *c, unsigned long rw,
340 struct bkey *k, struct closure *parent)
341{
342 struct closure *cl = &c->uuid_write.cl;
343 struct uuid_entry *u;
344 unsigned i;
345
346 BUG_ON(!parent);
347 closure_lock(&c->uuid_write, parent);
348
349 for (i = 0; i < KEY_PTRS(k); i++) {
350 struct bio *bio = bch_bbio_alloc(c);
351
352 bio->bi_rw = REQ_SYNC|REQ_META|rw;
353 bio->bi_size = KEY_SIZE(k) << 9;
354
355 bio->bi_end_io = uuid_endio;
356 bio->bi_private = cl;
357 bch_bio_map(bio, c->uuids);
358
359 bch_submit_bbio(bio, c, k, i);
360
361 if (!(rw & WRITE))
362 break;
363 }
364
365 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read",
366 pkey(&c->uuid_bucket));
367
368 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
369 if (!bch_is_zero(u->uuid, 16))
370 pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
371 u - c->uuids, u->uuid, u->label,
372 u->first_reg, u->last_reg, u->invalidated);
373
374 closure_return(cl);
375}
376
377static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
378{
379 struct bkey *k = &j->uuid_bucket;
380
381 if (__bch_ptr_invalid(c, 1, k))
382 return "bad uuid pointer";
383
384 bkey_copy(&c->uuid_bucket, k);
385 uuid_io(c, READ_SYNC, k, cl);
386
387 if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
388 struct uuid_entry_v0 *u0 = (void *) c->uuids;
389 struct uuid_entry *u1 = (void *) c->uuids;
390 int i;
391
392 closure_sync(cl);
393
394 /*
395 * Since the new uuid entry is bigger than the old, we have to
396 * convert starting at the highest memory address and work down
397 * in order to do it in place
398 */
399
400 for (i = c->nr_uuids - 1;
401 i >= 0;
402 --i) {
403 memcpy(u1[i].uuid, u0[i].uuid, 16);
404 memcpy(u1[i].label, u0[i].label, 32);
405
406 u1[i].first_reg = u0[i].first_reg;
407 u1[i].last_reg = u0[i].last_reg;
408 u1[i].invalidated = u0[i].invalidated;
409
410 u1[i].flags = 0;
411 u1[i].sectors = 0;
412 }
413 }
414
415 return NULL;
416}
417
418static int __uuid_write(struct cache_set *c)
419{
420 BKEY_PADDED(key) k;
421 struct closure cl;
422 closure_init_stack(&cl);
423
424 lockdep_assert_held(&bch_register_lock);
425
426 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl))
427 return 1;
428
429 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
430 uuid_io(c, REQ_WRITE, &k.key, &cl);
431 closure_sync(&cl);
432
433 bkey_copy(&c->uuid_bucket, &k.key);
434 __bkey_put(c, &k.key);
435 return 0;
436}
437
438int bch_uuid_write(struct cache_set *c)
439{
440 int ret = __uuid_write(c);
441
442 if (!ret)
443 bch_journal_meta(c, NULL);
444
445 return ret;
446}
447
448static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
449{
450 struct uuid_entry *u;
451
452 for (u = c->uuids;
453 u < c->uuids + c->nr_uuids; u++)
454 if (!memcmp(u->uuid, uuid, 16))
455 return u;
456
457 return NULL;
458}
459
/* Find the first unused slot in the UUID array, i.e. one with an all-zero uuid. */
static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
	struct uuid_entry *slot = uuid_find(c, zero_uuid);

	return slot;
}
465
466/*
467 * Bucket priorities/gens:
468 *
469 * For each bucket, we store on disk its
470 * 8 bit gen
471 * 16 bit priority
472 *
473 * See alloc.c for an explanation of the gen. The priority is used to implement
474 * lru (and in the future other) cache replacement policies; for most purposes
475 * it's just an opaque integer.
476 *
477 * The gens and the priorities don't have a whole lot to do with each other, and
478 * it's actually the gens that must be written out at specific times - it's no
479 * big deal if the priorities don't get written, if we lose them we just reuse
480 * buckets in suboptimal order.
481 *
482 * On disk they're stored in a packed array, and in as many buckets as are required
483 * to fit them all. The buckets we use to store them form a list; the journal
484 * header points to the first bucket, the first bucket points to the second
485 * bucket, et cetera.
486 *
487 * This code is used by the allocation code; periodically (whenever it runs out
488 * of buckets to allocate from) the allocation code will invalidate some
489 * buckets, but it can't use those buckets until their new gens are safely on
490 * disk.
491 */
492
493static void prio_endio(struct bio *bio, int error)
494{
495 struct cache *ca = bio->bi_private;
496
497 cache_set_err_on(error, ca->set, "accessing priorities");
498 bch_bbio_free(bio, ca->set);
499 closure_put(&ca->prio);
500}
501
502static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
503{
504 struct closure *cl = &ca->prio;
505 struct bio *bio = bch_bbio_alloc(ca->set);
506
507 closure_init_stack(cl);
508
509 bio->bi_sector = bucket * ca->sb.bucket_size;
510 bio->bi_bdev = ca->bdev;
511 bio->bi_rw = REQ_SYNC|REQ_META|rw;
512 bio->bi_size = bucket_bytes(ca);
513
514 bio->bi_end_io = prio_endio;
515 bio->bi_private = ca;
516 bch_bio_map(bio, ca->disk_buckets);
517
518 closure_bio_submit(bio, &ca->prio, ca);
519 closure_sync(cl);
520}
521
/*
 * Expands to a format string followed by its three arguments (fill levels of
 * the free, free_inc and unused fifos of cache c), so it can only be used as
 * the tail of a printf-style argument list.
 */
522#define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \
523 fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
524
/*
 * Write every bucket's priority and gen for cache @ca out to freshly
 * allocated prio buckets, then journal the new locations.
 *
 * Must be called with ca->set->bucket_lock held.  The lock is dropped around
 * each blocking step (the bucket write and the journal write) and is held
 * again when the function returns.
 */
525void bch_prio_write(struct cache *ca)
526{
527 int i;
528 struct bucket *b;
529 struct closure cl;
530
531 closure_init_stack(&cl);
532
533 lockdep_assert_held(&ca->set->bucket_lock);
534
/* The gens about to be written become the on-disk gens */
535 for (b = ca->buckets;
536 b < ca->buckets + ca->sb.nbuckets; b++)
537 b->disk_gen = b->gen;
538
539 ca->disk_buckets->seq++;
540
541 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
542 &ca->meta_sectors_written);
543
544 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
545 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
546 blktrace_msg(ca, "Starting priorities: " buckets_free(ca));
547
/*
 * Work from the last prio bucket to the first, so that each one can record
 * the location of the bucket that follows it in the chain.
 */
548 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
549 long bucket;
550 struct prio_set *p = ca->disk_buckets;
551 struct bucket_disk *d = p->data;
552 struct bucket_disk *end = d + prios_per_bucket(ca);
553
/* Pack the slice of buckets that belongs in prio bucket i */
554 for (b = ca->buckets + i * prios_per_bucket(ca);
555 b < ca->buckets + ca->sb.nbuckets && d < end;
556 b++, d++) {
557 d->prio = cpu_to_le16(b->prio);
558 d->gen = b->gen;
559 }
560
/*
 * NOTE(review): for the last prio bucket this reads index prio_buckets(ca),
 * one past the entries written by this loop - confirm the array is sized
 * for that.
 */
561 p->next_bucket = ca->prio_buckets[i + 1];
562 p->magic = pset_magic(ca);
/* Checksum covers everything from magic onwards (bucket size minus 8) */
563 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
564
565 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl);
566 BUG_ON(bucket == -1);
567
/* prio_io() blocks until the write completes, so drop the lock around it */
568 mutex_unlock(&ca->set->bucket_lock);
569 prio_io(ca, bucket, REQ_WRITE);
570 mutex_lock(&ca->set->bucket_lock);
571
572 ca->prio_buckets[i] = bucket;
573 atomic_dec_bug(&ca->buckets[bucket].pin);
574 }
575
/* Journal the new locations and wait, again without the lock held */
576 mutex_unlock(&ca->set->bucket_lock);
577
578 bch_journal_meta(ca->set, &cl);
579 closure_sync(&cl);
580
581 mutex_lock(&ca->set->bucket_lock);
582
583 ca->need_save_prio = 0;
584
585 /*
586 * Don't want the old priorities to get garbage collected until after we
587 * finish writing the new ones, and they're journalled
588 */
589 for (i = 0; i < prio_buckets(ca); i++)
590 ca->prio_last_buckets[i] = ca->prio_buckets[i];
591}
592
593static void prio_read(struct cache *ca, uint64_t bucket)
594{
595 struct prio_set *p = ca->disk_buckets;
596 struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
597 struct bucket *b;
598 unsigned bucket_nr = 0;
599
600 for (b = ca->buckets;
601 b < ca->buckets + ca->sb.nbuckets;
602 b++, d++) {
603 if (d == end) {
604 ca->prio_buckets[bucket_nr] = bucket;
605 ca->prio_last_buckets[bucket_nr] = bucket;
606 bucket_nr++;
607
608 prio_io(ca, bucket, READ_SYNC);
609
610 if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
611 pr_warn("bad csum reading priorities");
612
613 if (p->magic != pset_magic(ca))
614 pr_warn("bad magic reading priorities");
615
616 bucket = p->next_bucket;
617 d = p->data;
618 }
619
620 b->prio = le16_to_cpu(d->prio);
621 b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
622 }
623}
624
625/* Bcache device */
626
627static int open_dev(struct block_device *b, fmode_t mode)
628{
629 struct bcache_device *d = b->bd_disk->private_data;
630 if (atomic_read(&d->closing))
631 return -ENXIO;
632
633 closure_get(&d->cl);
634 return 0;
635}
636
637static int release_dev(struct gendisk *b, fmode_t mode)
638{
639 struct bcache_device *d = b->private_data;
640 closure_put(&d->cl);
641 return 0;
642}
643
644static int ioctl_dev(struct block_device *b, fmode_t mode,
645 unsigned int cmd, unsigned long arg)
646{
647 struct bcache_device *d = b->bd_disk->private_data;
648 return d->ioctl(d, mode, cmd, arg);
649}
650
651static const struct block_device_operations bcache_ops = {
652 .open = open_dev,
653 .release = release_dev,
654 .ioctl = ioctl_dev,
655 .owner = THIS_MODULE,
656};
657
658void bcache_device_stop(struct bcache_device *d)
659{
660 if (!atomic_xchg(&d->closing, 1))
661 closure_queue(&d->cl);
662}
663
664static void bcache_device_unlink(struct bcache_device *d)
665{
666 unsigned i;
667 struct cache *ca;
668
669 sysfs_remove_link(&d->c->kobj, d->name);
670 sysfs_remove_link(&d->kobj, "cache");
671
672 for_each_cache(ca, d->c, i)
673 bd_unlink_disk_holder(ca->bdev, d->disk);
674}
675
676static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
677 const char *name)
678{
679 unsigned i;
680 struct cache *ca;
681
682 for_each_cache(ca, d->c, i)
683 bd_link_disk_holder(ca->bdev, d->disk);
684
685 snprintf(d->name, BCACHEDEVNAME_SIZE,
686 "%s%u", name, d->id);
687
688 WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
689 sysfs_create_link(&c->kobj, &d->kobj, d->name),
690 "Couldn't create device <-> cache set symlinks");
691}
692
693static void bcache_device_detach(struct bcache_device *d)
694{
695 lockdep_assert_held(&bch_register_lock);
696
697 if (atomic_read(&d->detaching)) {
698 struct uuid_entry *u = d->c->uuids + d->id;
699
700 SET_UUID_FLASH_ONLY(u, 0);
701 memcpy(u->uuid, invalid_uuid, 16);
702 u->invalidated = cpu_to_le32(get_seconds());
703 bch_uuid_write(d->c);
704
705 atomic_set(&d->detaching, 0);
706 }
707
708 bcache_device_unlink(d);
709
710 d->c->devices[d->id] = NULL;
711 closure_put(&d->c->caching);
712 d->c = NULL;
713}
714
715static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
716 unsigned id)
717{
718 BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));
719
720 d->id = id;
721 d->c = c;
722 c->devices[id] = d;
723
724 closure_get(&c->caching);
725}
726
727static void bcache_device_free(struct bcache_device *d)
728{
729 lockdep_assert_held(&bch_register_lock);
730
731 pr_info("%s stopped", d->disk->disk_name);
732
733 if (d->c)
734 bcache_device_detach(d);
735
736 if (d->disk)
737 del_gendisk(d->disk);
738 if (d->disk && d->disk->queue)
739 blk_cleanup_queue(d->disk->queue);
740 if (d->disk)
741 put_disk(d->disk);
742
743 bio_split_pool_free(&d->bio_split_hook);
744 if (d->unaligned_bvec)
745 mempool_destroy(d->unaligned_bvec);
746 if (d->bio_split)
747 bioset_free(d->bio_split);
748
749 closure_debug_destroy(&d->cl);
750}
751
752static int bcache_device_init(struct bcache_device *d, unsigned block_size)
753{
754 struct request_queue *q;
755
756 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
757 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
758 sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
759 bio_split_pool_init(&d->bio_split_hook))
760
761 return -ENOMEM;
762
763 d->disk = alloc_disk(1);
764 if (!d->disk)
765 return -ENOMEM;
766
767 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
768
769 d->disk->major = bcache_major;
770 d->disk->first_minor = bcache_minor++;
771 d->disk->fops = &bcache_ops;
772 d->disk->private_data = d;
773
774 q = blk_alloc_queue(GFP_KERNEL);
775 if (!q)
776 return -ENOMEM;
777
778 blk_queue_make_request(q, NULL);
779 d->disk->queue = q;
780 q->queuedata = d;
781 q->backing_dev_info.congested_data = d;
782 q->limits.max_hw_sectors = UINT_MAX;
783 q->limits.max_sectors = UINT_MAX;
784 q->limits.max_segment_size = UINT_MAX;
785 q->limits.max_segments = BIO_MAX_PAGES;
786 q->limits.max_discard_sectors = UINT_MAX;
787 q->limits.io_min = block_size;
788 q->limits.logical_block_size = block_size;
789 q->limits.physical_block_size = block_size;
790 set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
791 set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
792
793 return 0;
794}
795
796/* Cached device */
797
798static void calc_cached_dev_sectors(struct cache_set *c)
799{
800 uint64_t sectors = 0;
801 struct cached_dev *dc;
802
803 list_for_each_entry(dc, &c->cached_devs, list)
804 sectors += bdev_sectors(dc->bdev);
805
806 c->cached_dev_sectors = sectors;
807}
808
809void bch_cached_dev_run(struct cached_dev *dc)
810{
811 struct bcache_device *d = &dc->disk;
812
813 if (atomic_xchg(&dc->running, 1))
814 return;
815
816 if (!d->c &&
817 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
818 struct closure cl;
819 closure_init_stack(&cl);
820
821 SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
822 bch_write_bdev_super(dc, &cl);
823 closure_sync(&cl);
824 }
825
826 add_disk(d->disk);
827 bd_link_disk_holder(dc->bdev, dc->disk.disk);
828#if 0
829 char *env[] = { "SYMLINK=label" , NULL };
830 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
831#endif
832 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
833 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
834 pr_debug("error creating sysfs link");
835}
836
837static void cached_dev_detach_finish(struct work_struct *w)
838{
839 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
840 char buf[BDEVNAME_SIZE];
841 struct closure cl;
842 closure_init_stack(&cl);
843
844 BUG_ON(!atomic_read(&dc->disk.detaching));
845 BUG_ON(atomic_read(&dc->count));
846
847 mutex_lock(&bch_register_lock);
848
849 memset(&dc->sb.set_uuid, 0, 16);
850 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
851
852 bch_write_bdev_super(dc, &cl);
853 closure_sync(&cl);
854
855 bcache_device_detach(&dc->disk);
856 list_move(&dc->list, &uncached_devices);
857
858 mutex_unlock(&bch_register_lock);
859
860 pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
861
862 /* Drop ref we took in cached_dev_detach() */
863 closure_put(&dc->disk.cl);
864}
865
866void bch_cached_dev_detach(struct cached_dev *dc)
867{
868 lockdep_assert_held(&bch_register_lock);
869
870 if (atomic_read(&dc->disk.closing))
871 return;
872
873 if (atomic_xchg(&dc->disk.detaching, 1))
874 return;
875
876 /*
877 * Block the device from being closed and freed until we're finished
878 * detaching
879 */
880 closure_get(&dc->disk.cl);
881
882 bch_writeback_queue(dc);
883 cached_dev_put(dc);
884}
885
886int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
887{
888 uint32_t rtime = cpu_to_le32(get_seconds());
889 struct uuid_entry *u;
890 char buf[BDEVNAME_SIZE];
891
892 bdevname(dc->bdev, buf);
893
894 if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
895 return -ENOENT;
896
897 if (dc->disk.c) {
898 pr_err("Can't attach %s: already attached", buf);
899 return -EINVAL;
900 }
901
902 if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
903 pr_err("Can't attach %s: shutting down", buf);
904 return -EINVAL;
905 }
906
907 if (dc->sb.block_size < c->sb.block_size) {
908 /* Will die */
909 pr_err("Couldn't attach %s: block size less than set's block size",
910 buf);
911 return -EINVAL;
912 }
913
914 u = uuid_find(c, dc->sb.uuid);
915
916 if (u &&
917 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
918 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
919 memcpy(u->uuid, invalid_uuid, 16);
920 u->invalidated = cpu_to_le32(get_seconds());
921 u = NULL;
922 }
923
924 if (!u) {
925 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
926 pr_err("Couldn't find uuid for %s in set", buf);
927 return -ENOENT;
928 }
929
930 u = uuid_find_empty(c);
931 if (!u) {
932 pr_err("Not caching %s, no room for UUID", buf);
933 return -EINVAL;
934 }
935 }
936
937 /* Deadlocks since we're called via sysfs...
938 sysfs_remove_file(&dc->kobj, &sysfs_attach);
939 */
940
941 if (bch_is_zero(u->uuid, 16)) {
942 struct closure cl;
943 closure_init_stack(&cl);
944
945 memcpy(u->uuid, dc->sb.uuid, 16);
946 memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
947 u->first_reg = u->last_reg = rtime;
948 bch_uuid_write(c);
949
950 memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
951 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
952
953 bch_write_bdev_super(dc, &cl);
954 closure_sync(&cl);
955 } else {
956 u->last_reg = rtime;
957 bch_uuid_write(c);
958 }
959
960 bcache_device_attach(&dc->disk, c, u - c->uuids);
961 list_move(&dc->list, &c->cached_devs);
962 calc_cached_dev_sectors(c);
963
964 smp_wmb();
965 /*
966 * dc->c must be set before dc->count != 0 - paired with the mb in
967 * cached_dev_get()
968 */
969 atomic_set(&dc->count, 1);
970
971 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
972 atomic_set(&dc->has_dirty, 1);
973 atomic_inc(&dc->count);
974 bch_writeback_queue(dc);
975 }
976
977 bch_cached_dev_run(dc);
978 bcache_device_link(&dc->disk, c, "bdev");
979
980 pr_info("Caching %s as %s on set %pU",
981 bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
982 dc->disk.c->sb.set_uuid);
983 return 0;
984}
985
986void bch_cached_dev_release(struct kobject *kobj)
987{
988 struct cached_dev *dc = container_of(kobj, struct cached_dev,
989 disk.kobj);
990 kfree(dc);
991 module_put(THIS_MODULE);
992}
993
994static void cached_dev_free(struct closure *cl)
995{
996 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
997
998 cancel_delayed_work_sync(&dc->writeback_rate_update);
999
1000 mutex_lock(&bch_register_lock);
1001
1002 bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1003 bcache_device_free(&dc->disk);
1004 list_del(&dc->list);
1005
1006 mutex_unlock(&bch_register_lock);
1007
1008 if (!IS_ERR_OR_NULL(dc->bdev)) {
1009 blk_sync_queue(bdev_get_queue(dc->bdev));
1010 blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1011 }
1012
1013 wake_up(&unregister_wait);
1014
1015 kobject_put(&dc->disk.kobj);
1016}
1017
1018static void cached_dev_flush(struct closure *cl)
1019{
1020 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1021 struct bcache_device *d = &dc->disk;
1022
1023 bch_cache_accounting_destroy(&dc->accounting);
1024 kobject_del(&d->kobj);
1025
1026 continue_at(cl, cached_dev_free, system_wq);
1027}
1028
1029static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1030{
1031 int err;
1032 struct io *io;
1033
1034 closure_init(&dc->disk.cl, NULL);
1035 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1036
1037 __module_get(THIS_MODULE);
1038 INIT_LIST_HEAD(&dc->list);
1039 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1040
1041 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1042
1043 err = bcache_device_init(&dc->disk, block_size);
1044 if (err)
1045 goto err;
1046
1047 spin_lock_init(&dc->io_lock);
1048 closure_init_unlocked(&dc->sb_write);
1049 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1050
1051 dc->sequential_merge = true;
1052 dc->sequential_cutoff = 4 << 20;
1053
1054 INIT_LIST_HEAD(&dc->io_lru);
1055 dc->sb_bio.bi_max_vecs = 1;
1056 dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
1057
1058 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1059 list_add(&io->lru, &dc->io_lru);
1060 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1061 }
1062
1063 bch_writeback_init_cached_dev(dc);
1064 return 0;
1065err:
1066 bcache_device_stop(&dc->disk);
1067 return err;
1068}
1069
1070/* Cached device - bcache superblock */
1071
1072static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
1073 struct block_device *bdev,
1074 struct cached_dev *dc)
1075{
1076 char name[BDEVNAME_SIZE];
1077 const char *err = "cannot allocate memory";
1078 struct gendisk *g;
1079 struct cache_set *c;
1080
1081 if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0)
1082 return err;
1083
1084 memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1085 dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
1086 dc->bdev = bdev;
1087 dc->bdev->bd_holder = dc;
1088
1089 g = dc->disk.disk;
1090
1091 set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1092
1093 g->queue->backing_dev_info.ra_pages =
1094 max(g->queue->backing_dev_info.ra_pages,
1095 bdev->bd_queue->backing_dev_info.ra_pages);
1096
1097 bch_cached_dev_request_init(dc);
1098
1099 err = "error creating kobject";
1100 if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
1101 "bcache"))
1102 goto err;
1103 if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1104 goto err;
1105
1106 list_add(&dc->list, &uncached_devices);
1107 list_for_each_entry(c, &bch_cache_sets, list)
1108 bch_cached_dev_attach(dc, c);
1109
1110 if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1111 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
1112 bch_cached_dev_run(dc);
1113
1114 return NULL;
1115err:
1116 kobject_put(&dc->disk.kobj);
1117 pr_notice("error opening %s: %s", bdevname(bdev, name), err);
1118 /*
1119 * Return NULL instead of an error because kobject_put() cleans
1120 * everything up
1121 */
1122 return NULL;
1123}
1124
1125/* Flash only volumes */
1126
1127void bch_flash_dev_release(struct kobject *kobj)
1128{
1129 struct bcache_device *d = container_of(kobj, struct bcache_device,
1130 kobj);
1131 kfree(d);
1132}
1133
1134static void flash_dev_free(struct closure *cl)
1135{
1136 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1137 bcache_device_free(d);
1138 kobject_put(&d->kobj);
1139}
1140
1141static void flash_dev_flush(struct closure *cl)
1142{
1143 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1144
1145 bcache_device_unlink(d);
1146 kobject_del(&d->kobj);
1147 continue_at(cl, flash_dev_free, system_wq);
1148}
1149
1150static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1151{
1152 struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1153 GFP_KERNEL);
1154 if (!d)
1155 return -ENOMEM;
1156
1157 closure_init(&d->cl, NULL);
1158 set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1159
1160 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1161
1162 if (bcache_device_init(d, block_bytes(c)))
1163 goto err;
1164
1165 bcache_device_attach(d, c, u - c->uuids);
1166 set_capacity(d->disk, u->sectors);
1167 bch_flash_dev_request_init(d);
1168 add_disk(d->disk);
1169
1170 if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1171 goto err;
1172
1173 bcache_device_link(d, c, "volume");
1174
1175 return 0;
1176err:
1177 kobject_put(&d->kobj);
1178 return -ENOMEM;
1179}
1180
1181static int flash_devs_run(struct cache_set *c)
1182{
1183 int ret = 0;
1184 struct uuid_entry *u;
1185
1186 for (u = c->uuids;
1187 u < c->uuids + c->nr_uuids && !ret;
1188 u++)
1189 if (UUID_FLASH_ONLY(u))
1190 ret = flash_dev_run(c, u);
1191
1192 return ret;
1193}
1194
1195int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1196{
1197 struct uuid_entry *u;
1198
1199 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1200 return -EINTR;
1201
1202 u = uuid_find_empty(c);
1203 if (!u) {
1204 pr_err("Can't create volume, no room for UUID");
1205 return -EINVAL;
1206 }
1207
1208 get_random_bytes(u->uuid, 16);
1209 memset(u->label, 0, 32);
1210 u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
1211
1212 SET_UUID_FLASH_ONLY(u, 1);
1213 u->sectors = size >> 9;
1214
1215 bch_uuid_write(c);
1216
1217 return flash_dev_run(c, u);
1218}
1219
1220/* Cache set */
1221
1222__printf(2, 3)
1223bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1224{
1225 va_list args;
1226
1227 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1228 return false;
1229
1230 /* XXX: we can be called from atomic context
1231 acquire_console_sem();
1232 */
1233
1234 printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);
1235
1236 va_start(args, fmt);
1237 vprintk(fmt, args);
1238 va_end(args);
1239
1240 printk(", disabling caching\n");
1241
1242 bch_cache_set_unregister(c);
1243 return true;
1244}
1245
1246void bch_cache_set_release(struct kobject *kobj)
1247{
1248 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1249 kfree(c);
1250 module_put(THIS_MODULE);
1251}
1252
1253static void cache_set_free(struct closure *cl)
1254{
1255 struct cache_set *c = container_of(cl, struct cache_set, cl);
1256 struct cache *ca;
1257 unsigned i;
1258
1259 if (!IS_ERR_OR_NULL(c->debug))
1260 debugfs_remove(c->debug);
1261
1262 bch_open_buckets_free(c);
1263 bch_btree_cache_free(c);
1264 bch_journal_free(c);
1265
1266 for_each_cache(ca, c, i)
1267 if (ca)
1268 kobject_put(&ca->kobj);
1269
1270 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1271 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
1272
1273 kfree(c->fill_iter);
1274 if (c->bio_split)
1275 bioset_free(c->bio_split);
1276 if (c->bio_meta)
1277 mempool_destroy(c->bio_meta);
1278 if (c->search)
1279 mempool_destroy(c->search);
1280 kfree(c->devices);
1281
1282 mutex_lock(&bch_register_lock);
1283 list_del(&c->list);
1284 mutex_unlock(&bch_register_lock);
1285
1286 pr_info("Cache set %pU unregistered", c->sb.set_uuid);
1287 wake_up(&unregister_wait);
1288
1289 closure_debug_destroy(&c->cl);
1290 kobject_put(&c->kobj);
1291}
1292
1293static void cache_set_flush(struct closure *cl)
1294{
1295 struct cache_set *c = container_of(cl, struct cache_set, caching);
1296 struct btree *b;
1297
1298 /* Shut down allocator threads */
1299 set_bit(CACHE_SET_STOPPING_2, &c->flags);
1300 wake_up(&c->alloc_wait);
1301
1302 bch_cache_accounting_destroy(&c->accounting);
1303
1304 kobject_put(&c->internal);
1305 kobject_del(&c->kobj);
1306
1307 if (!IS_ERR_OR_NULL(c->root))
1308 list_add(&c->root->list, &c->btree_cache);
1309
1310 /* Should skip this if we're unregistering because of an error */
1311 list_for_each_entry(b, &c->btree_cache, list)
1312 if (btree_node_dirty(b))
1313 bch_btree_write(b, true, NULL);
1314
1315 closure_return(cl);
1316}
1317
1318static void __cache_set_unregister(struct closure *cl)
1319{
1320 struct cache_set *c = container_of(cl, struct cache_set, caching);
1321 struct cached_dev *dc, *t;
1322 size_t i;
1323
1324 mutex_lock(&bch_register_lock);
1325
1326 if (test_bit(CACHE_SET_UNREGISTERING, &c->flags))
1327 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
1328 bch_cached_dev_detach(dc);
1329
1330 for (i = 0; i < c->nr_uuids; i++)
1331 if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i]))
1332 bcache_device_stop(c->devices[i]);
1333
1334 mutex_unlock(&bch_register_lock);
1335
1336 continue_at(cl, cache_set_flush, system_wq);
1337}
1338
1339void bch_cache_set_stop(struct cache_set *c)
1340{
1341 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1342 closure_queue(&c->caching);
1343}
1344
1345void bch_cache_set_unregister(struct cache_set *c)
1346{
1347 set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1348 bch_cache_set_stop(c);
1349}
1350
/*
 * Allocate zeroed pages of order ilog2(bucket_pages(c)); returns NULL on
 * failure.  @gfp is parenthesized so a compound flag expression can be passed
 * safely.
 */
#define alloc_bucket_pages(gfp, c)					\
	((void *) __get_free_pages(__GFP_ZERO|(gfp), ilog2(bucket_pages(c))))
1353
1354struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1355{
1356 int iter_size;
1357 struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1358 if (!c)
1359 return NULL;
1360
1361 __module_get(THIS_MODULE);
1362 closure_init(&c->cl, NULL);
1363 set_closure_fn(&c->cl, cache_set_free, system_wq);
1364
1365 closure_init(&c->caching, &c->cl);
1366 set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1367
1368 /* Maybe create continue_at_noreturn() and use it here? */
1369 closure_set_stopped(&c->cl);
1370 closure_put(&c->cl);
1371
1372 kobject_init(&c->kobj, &bch_cache_set_ktype);
1373 kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1374
1375 bch_cache_accounting_init(&c->accounting, &c->cl);
1376
1377 memcpy(c->sb.set_uuid, sb->set_uuid, 16);
1378 c->sb.block_size = sb->block_size;
1379 c->sb.bucket_size = sb->bucket_size;
1380 c->sb.nr_in_set = sb->nr_in_set;
1381 c->sb.last_mount = sb->last_mount;
1382 c->bucket_bits = ilog2(sb->bucket_size);
1383 c->block_bits = ilog2(sb->block_size);
1384 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1385
1386 c->btree_pages = c->sb.bucket_size / PAGE_SECTORS;
1387 if (c->btree_pages > BTREE_MAX_PAGES)
1388 c->btree_pages = max_t(int, c->btree_pages / 4,
1389 BTREE_MAX_PAGES);
1390
1391 init_waitqueue_head(&c->alloc_wait);
1392 mutex_init(&c->bucket_lock);
1393 mutex_init(&c->fill_lock);
1394 mutex_init(&c->sort_lock);
1395 spin_lock_init(&c->sort_time_lock);
1396 closure_init_unlocked(&c->sb_write);
1397 closure_init_unlocked(&c->uuid_write);
1398 spin_lock_init(&c->btree_read_time_lock);
1399 bch_moving_init_cache_set(c);
1400
1401 INIT_LIST_HEAD(&c->list);
1402 INIT_LIST_HEAD(&c->cached_devs);
1403 INIT_LIST_HEAD(&c->btree_cache);
1404 INIT_LIST_HEAD(&c->btree_cache_freeable);
1405 INIT_LIST_HEAD(&c->btree_cache_freed);
1406 INIT_LIST_HEAD(&c->data_buckets);
1407
1408 c->search = mempool_create_slab_pool(32, bch_search_cache);
1409 if (!c->search)
1410 goto err;
1411
1412 iter_size = (sb->bucket_size / sb->block_size + 1) *
1413 sizeof(struct btree_iter_set);
1414
1415 if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
1416 !(c->bio_meta = mempool_create_kmalloc_pool(2,
1417 sizeof(struct bbio) + sizeof(struct bio_vec) *
1418 bucket_pages(c))) ||
1419 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1420 !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
1421 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
1422 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1423 bch_journal_alloc(c) ||
1424 bch_btree_cache_alloc(c) ||
1425 bch_open_buckets_alloc(c))
1426 goto err;
1427
1428 c->fill_iter->size = sb->bucket_size / sb->block_size;
1429
1430 c->congested_read_threshold_us = 2000;
1431 c->congested_write_threshold_us = 20000;
1432 c->error_limit = 8 << IO_ERROR_SHIFT;
1433
1434 return c;
1435err:
1436 bch_cache_set_unregister(c);
1437 return NULL;
1438}
1439
1440static void run_cache_set(struct cache_set *c)
1441{
1442 const char *err = "cannot allocate memory";
1443 struct cached_dev *dc, *t;
1444 struct cache *ca;
1445 unsigned i;
1446
1447 struct btree_op op;
1448 bch_btree_op_init_stack(&op);
1449 op.lock = SHRT_MAX;
1450
1451 for_each_cache(ca, c, i)
1452 c->nbuckets += ca->sb.nbuckets;
1453
1454 if (CACHE_SYNC(&c->sb)) {
1455 LIST_HEAD(journal);
1456 struct bkey *k;
1457 struct jset *j;
1458
1459 err = "cannot allocate memory for journal";
1460 if (bch_journal_read(c, &journal, &op))
1461 goto err;
1462
1463 pr_debug("btree_journal_read() done");
1464
1465 err = "no journal entries found";
1466 if (list_empty(&journal))
1467 goto err;
1468
1469 j = &list_entry(journal.prev, struct journal_replay, list)->j;
1470
1471 err = "IO error reading priorities";
1472 for_each_cache(ca, c, i)
1473 prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
1474
1475 /*
1476 * If prio_read() fails it'll call cache_set_error and we'll
1477 * tear everything down right away, but if we perhaps checked
1478 * sooner we could avoid journal replay.
1479 */
1480
1481 k = &j->btree_root;
1482
1483 err = "bad btree root";
1484 if (__bch_ptr_invalid(c, j->btree_level + 1, k))
1485 goto err;
1486
1487 err = "error reading btree root";
1488 c->root = bch_btree_node_get(c, k, j->btree_level, &op);
1489 if (IS_ERR_OR_NULL(c->root))
1490 goto err;
1491
1492 list_del_init(&c->root->list);
1493 rw_unlock(true, c->root);
1494
1495 err = uuid_read(c, j, &op.cl);
1496 if (err)
1497 goto err;
1498
1499 err = "error in recovery";
1500 if (bch_btree_check(c, &op))
1501 goto err;
1502
1503 bch_journal_mark(c, &journal);
1504 bch_btree_gc_finish(c);
1505 pr_debug("btree_check() done");
1506
1507 /*
1508 * bcache_journal_next() can't happen sooner, or
1509 * btree_gc_finish() will give spurious errors about last_gc >
1510 * gc_gen - this is a hack but oh well.
1511 */
1512 bch_journal_next(&c->journal);
1513
1514 for_each_cache(ca, c, i)
1515 closure_call(&ca->alloc, bch_allocator_thread,
1516 system_wq, &c->cl);
1517
1518 /*
1519 * First place it's safe to allocate: btree_check() and
1520 * btree_gc_finish() have to run before we have buckets to
1521 * allocate, and bch_bucket_alloc_set() might cause a journal
1522 * entry to be written so bcache_journal_next() has to be called
1523 * first.
1524 *
1525 * If the uuids were in the old format we have to rewrite them
1526 * before the next journal entry is written:
1527 */
1528 if (j->version < BCACHE_JSET_VERSION_UUID)
1529 __uuid_write(c);
1530
1531 bch_journal_replay(c, &journal, &op);
1532 } else {
1533 pr_notice("invalidating existing data");
1534 /* Don't want invalidate_buckets() to queue a gc yet */
1535 closure_lock(&c->gc, NULL);
1536
1537 for_each_cache(ca, c, i) {
1538 unsigned j;
1539
1540 ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
1541 2, SB_JOURNAL_BUCKETS);
1542
1543 for (j = 0; j < ca->sb.keys; j++)
1544 ca->sb.d[j] = ca->sb.first_bucket + j;
1545 }
1546
1547 bch_btree_gc_finish(c);
1548
1549 for_each_cache(ca, c, i)
1550 closure_call(&ca->alloc, bch_allocator_thread,
1551 ca->alloc_workqueue, &c->cl);
1552
1553 mutex_lock(&c->bucket_lock);
1554 for_each_cache(ca, c, i)
1555 bch_prio_write(ca);
1556 mutex_unlock(&c->bucket_lock);
1557
1558 wake_up(&c->alloc_wait);
1559
1560 err = "cannot allocate new UUID bucket";
1561 if (__uuid_write(c))
1562 goto err_unlock_gc;
1563
1564 err = "cannot allocate new btree root";
1565 c->root = bch_btree_node_alloc(c, 0, &op.cl);
1566 if (IS_ERR_OR_NULL(c->root))
1567 goto err_unlock_gc;
1568
1569 bkey_copy_key(&c->root->key, &MAX_KEY);
1570 bch_btree_write(c->root, true, &op);
1571
1572 bch_btree_set_root(c->root);
1573 rw_unlock(true, c->root);
1574
1575 /*
1576 * We don't want to write the first journal entry until
1577 * everything is set up - fortunately journal entries won't be
1578 * written until the SET_CACHE_SYNC() here:
1579 */
1580 SET_CACHE_SYNC(&c->sb, true);
1581
1582 bch_journal_next(&c->journal);
1583 bch_journal_meta(c, &op.cl);
1584
1585 /* Unlock */
1586 closure_set_stopped(&c->gc.cl);
1587 closure_put(&c->gc.cl);
1588 }
1589
1590 closure_sync(&op.cl);
1591 c->sb.last_mount = get_seconds();
1592 bcache_write_super(c);
1593
1594 list_for_each_entry_safe(dc, t, &uncached_devices, list)
1595 bch_cached_dev_attach(dc, c);
1596
1597 flash_devs_run(c);
1598
1599 return;
1600err_unlock_gc:
1601 closure_set_stopped(&c->gc.cl);
1602 closure_put(&c->gc.cl);
1603err:
1604 closure_sync(&op.cl);
1605 /* XXX: test this, it's broken */
1606 bch_cache_set_error(c, err);
1607}
1608
1609static bool can_attach_cache(struct cache *ca, struct cache_set *c)
1610{
1611 return ca->sb.block_size == c->sb.block_size &&
1612 ca->sb.bucket_size == c->sb.block_size &&
1613 ca->sb.nr_in_set == c->sb.nr_in_set;
1614}
1615
1616static const char *register_cache_set(struct cache *ca)
1617{
1618 char buf[12];
1619 const char *err = "cannot allocate memory";
1620 struct cache_set *c;
1621
1622 list_for_each_entry(c, &bch_cache_sets, list)
1623 if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
1624 if (c->cache[ca->sb.nr_this_dev])
1625 return "duplicate cache set member";
1626
1627 if (!can_attach_cache(ca, c))
1628 return "cache sb does not match set";
1629
1630 if (!CACHE_SYNC(&ca->sb))
1631 SET_CACHE_SYNC(&c->sb, false);
1632
1633 goto found;
1634 }
1635
1636 c = bch_cache_set_alloc(&ca->sb);
1637 if (!c)
1638 return err;
1639
1640 err = "error creating kobject";
1641 if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
1642 kobject_add(&c->internal, &c->kobj, "internal"))
1643 goto err;
1644
1645 if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
1646 goto err;
1647
1648 bch_debug_init_cache_set(c);
1649
1650 list_add(&c->list, &bch_cache_sets);
1651found:
1652 sprintf(buf, "cache%i", ca->sb.nr_this_dev);
1653 if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
1654 sysfs_create_link(&c->kobj, &ca->kobj, buf))
1655 goto err;
1656
1657 if (ca->sb.seq > c->sb.seq) {
1658 c->sb.version = ca->sb.version;
1659 memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
1660 c->sb.flags = ca->sb.flags;
1661 c->sb.seq = ca->sb.seq;
1662 pr_debug("set version = %llu", c->sb.version);
1663 }
1664
1665 ca->set = c;
1666 ca->set->cache[ca->sb.nr_this_dev] = ca;
1667 c->cache_by_alloc[c->caches_loaded++] = ca;
1668
1669 if (c->caches_loaded == c->sb.nr_in_set)
1670 run_cache_set(c);
1671
1672 return NULL;
1673err:
1674 bch_cache_set_unregister(c);
1675 return err;
1676}
1677
1678/* Cache device */
1679
1680void bch_cache_release(struct kobject *kobj)
1681{
1682 struct cache *ca = container_of(kobj, struct cache, kobj);
1683
1684 if (ca->set)
1685 ca->set->cache[ca->sb.nr_this_dev] = NULL;
1686
1687 bch_cache_allocator_exit(ca);
1688
1689 bio_split_pool_free(&ca->bio_split_hook);
1690
1691 if (ca->alloc_workqueue)
1692 destroy_workqueue(ca->alloc_workqueue);
1693
1694 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1695 kfree(ca->prio_buckets);
1696 vfree(ca->buckets);
1697
1698 free_heap(&ca->heap);
1699 free_fifo(&ca->unused);
1700 free_fifo(&ca->free_inc);
1701 free_fifo(&ca->free);
1702
1703 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1704 put_page(ca->sb_bio.bi_io_vec[0].bv_page);
1705
1706 if (!IS_ERR_OR_NULL(ca->bdev)) {
1707 blk_sync_queue(bdev_get_queue(ca->bdev));
1708 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1709 }
1710
1711 kfree(ca);
1712 module_put(THIS_MODULE);
1713}
1714
1715static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1716{
1717 size_t free;
1718 struct bucket *b;
1719
1720 if (!ca)
1721 return -ENOMEM;
1722
1723 __module_get(THIS_MODULE);
1724 kobject_init(&ca->kobj, &bch_cache_ktype);
1725
1726 memcpy(&ca->sb, sb, sizeof(struct cache_sb));
1727
1728 INIT_LIST_HEAD(&ca->discards);
1729
1730 bio_init(&ca->sb_bio);
1731 ca->sb_bio.bi_max_vecs = 1;
1732 ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;
1733
1734 bio_init(&ca->journal.bio);
1735 ca->journal.bio.bi_max_vecs = 8;
1736 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
1737
1738 free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
1739 free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
1740
1741 if (!init_fifo(&ca->free, free, GFP_KERNEL) ||
1742 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
1743 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
1744 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
1745 !(ca->buckets = vmalloc(sizeof(struct bucket) *
1746 ca->sb.nbuckets)) ||
1747 !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1748 2, GFP_KERNEL)) ||
1749 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1750 !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
1751 bio_split_pool_init(&ca->bio_split_hook))
1752 goto err;
1753
1754 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
1755
1756 memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket));
1757 for_each_bucket(b, ca)
1758 atomic_set(&b->pin, 0);
1759
1760 if (bch_cache_allocator_init(ca))
1761 goto err;
1762
1763 return 0;
1764err:
1765 kobject_put(&ca->kobj);
1766 return -ENOMEM;
1767}
1768
1769static const char *register_cache(struct cache_sb *sb, struct page *sb_page,
1770 struct block_device *bdev, struct cache *ca)
1771{
1772 char name[BDEVNAME_SIZE];
1773 const char *err = "cannot allocate memory";
1774
1775 if (cache_alloc(sb, ca) != 0)
1776 return err;
1777
1778 ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
1779 ca->bdev = bdev;
1780 ca->bdev->bd_holder = ca;
1781
1782 if (blk_queue_discard(bdev_get_queue(ca->bdev)))
1783 ca->discard = CACHE_DISCARD(&ca->sb);
1784
1785 err = "error creating kobject";
1786 if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
1787 goto err;
1788
1789 err = register_cache_set(ca);
1790 if (err)
1791 goto err;
1792
1793 pr_info("registered cache device %s", bdevname(bdev, name));
1794
1795 return NULL;
1796err:
1797 kobject_put(&ca->kobj);
1798 pr_info("error opening %s: %s", bdevname(bdev, name), err);
1799 /* Return NULL instead of an error because kobject_put() cleans
1800 * everything up
1801 */
1802 return NULL;
1803}
1804
1805/* Global interfaces/init */
1806
/*
 * register_bcache() handles writes to both the "register" and the
 * "register_quiet" attributes, so it is declared ahead of them.  The handler
 * skips its error log when invoked through register_quiet.
 */
1807static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1808 const char *, size_t);
1809
1810kobj_attribute_write(register, register_bcache);
1811kobj_attribute_write(register_quiet, register_bcache);
1812
1813static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1814 const char *buffer, size_t size)
1815{
1816 ssize_t ret = size;
1817 const char *err = "cannot allocate memory";
1818 char *path = NULL;
1819 struct cache_sb *sb = NULL;
1820 struct block_device *bdev = NULL;
1821 struct page *sb_page = NULL;
1822
1823 if (!try_module_get(THIS_MODULE))
1824 return -EBUSY;
1825
1826 mutex_lock(&bch_register_lock);
1827
1828 if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
1829 !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
1830 goto err;
1831
1832 err = "failed to open device";
1833 bdev = blkdev_get_by_path(strim(path),
1834 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1835 sb);
1836 if (bdev == ERR_PTR(-EBUSY))
1837 err = "device busy";
1838
1839 if (IS_ERR(bdev) ||
1840 set_blocksize(bdev, 4096))
1841 goto err;
1842
1843 err = read_super(sb, bdev, &sb_page);
1844 if (err)
1845 goto err_close;
1846
1847 if (SB_IS_BDEV(sb)) {
1848 struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
1849
1850 err = register_bdev(sb, sb_page, bdev, dc);
1851 } else {
1852 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1853
1854 err = register_cache(sb, sb_page, bdev, ca);
1855 }
1856
1857 if (err) {
1858 /* register_(bdev|cache) will only return an error if they
1859 * didn't get far enough to create the kobject - if they did,
1860 * the kobject destructor will do this cleanup.
1861 */
1862 put_page(sb_page);
1863err_close:
1864 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1865err:
1866 if (attr != &ksysfs_register_quiet)
1867 pr_info("error opening %s: %s", path, err);
1868 ret = -EINVAL;
1869 }
1870
1871 kfree(sb);
1872 kfree(path);
1873 mutex_unlock(&bch_register_lock);
1874 module_put(THIS_MODULE);
1875 return ret;
1876}
1877
/*
 * Reboot notifier callback.
 *
 * For SYS_DOWN, SYS_HALT and SYS_POWER_OFF: asks every cache set on
 * bch_cache_sets and every device on uncached_devices to stop, then waits
 * until both lists are empty or 2 * HZ jiffies have passed since entry.
 * Any other code is ignored.  Always returns NOTIFY_DONE.
 */
1878static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
1879{
1880 if (code == SYS_DOWN ||
1881 code == SYS_HALT ||
1882 code == SYS_POWER_OFF) {
1883 DEFINE_WAIT(wait);
1884 unsigned long start = jiffies;
1885 bool stopped = false;
1886
1887 struct cache_set *c, *tc;
1888 struct cached_dev *dc, *tdc;
1889
1890 mutex_lock(&bch_register_lock);
1891
 /* Nothing registered: skip the log messages and the wait. */
1892 if (list_empty(&bch_cache_sets) &&
1893 list_empty(&uncached_devices))
1894 goto out;
1895
1896 pr_info("Stopping all devices:");
1897
1898 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1899 bch_cache_set_stop(c);
1900
1901 list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
1902 bcache_device_stop(&dc->disk);
1903
1904 /* What's a condition variable? */
1905 while (1) {
 /* Jiffies remaining until the deadline; negative once it has passed. */
1906 long timeout = start + 2 * HZ - jiffies;
1907
1908 stopped = list_empty(&bch_cache_sets) &&
1909 list_empty(&uncached_devices);
1910
1911 if (timeout < 0 || stopped)
1912 break;
1913
1914 prepare_to_wait(&unregister_wait, &wait,
1915 TASK_UNINTERRUPTIBLE);
1916
 /*
  * The register lock is dropped for the duration of the sleep.
  * NOTE(review): presumably so the teardown paths that empty the
  * lists can take it - confirm against the unregister code.
  */
1917 mutex_unlock(&bch_register_lock);
1918 schedule_timeout(timeout);
1919 mutex_lock(&bch_register_lock);
1920 }
1921
1922 finish_wait(&unregister_wait, &wait);
1923
1924 if (stopped)
1925 pr_info("All devices stopped");
1926 else
1927 pr_notice("Timeout waiting for devices to be closed");
1928out:
1929 mutex_unlock(&bch_register_lock);
1930 }
1931
1932 return NOTIFY_DONE;
1933}
1934
/*
 * Notifier block that routes reboot events to bcache_reboot().  It is
 * registered in bcache_init() and unregistered in bcache_exit().
 */
1935static struct notifier_block reboot = {
1936 .notifier_call = bcache_reboot,
1937 .priority = INT_MAX, /* before any real devices */
1938};
1939
/*
 * Module teardown.  Also called by bcache_init() to unwind a partially
 * completed initialisation, which is why bcache_kobj and bcache_wq are
 * checked for NULL before being released.
 */
1940static void bcache_exit(void)
1941{
1942 bch_debug_exit();
1943 bch_writeback_exit();
1944 bch_request_exit();
1945 bch_btree_exit();
1946 if (bcache_kobj)
1947 kobject_put(bcache_kobj);
1948 if (bcache_wq)
1949 destroy_workqueue(bcache_wq);
1950 unregister_blkdev(bcache_major, "bcache");
1951 unregister_reboot_notifier(&reboot);
1952}
1953
1954static int __init bcache_init(void)
1955{
1956 static const struct attribute *files[] = {
1957 &ksysfs_register.attr,
1958 &ksysfs_register_quiet.attr,
1959 NULL
1960 };
1961
1962 mutex_init(&bch_register_lock);
1963 init_waitqueue_head(&unregister_wait);
1964 register_reboot_notifier(&reboot);
1965 closure_debug_init();
1966
1967 bcache_major = register_blkdev(0, "bcache");
1968 if (bcache_major < 0)
1969 return bcache_major;
1970
1971 if (!(bcache_wq = create_workqueue("bcache")) ||
1972 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
1973 sysfs_create_files(bcache_kobj, files) ||
1974 bch_btree_init() ||
1975 bch_request_init() ||
1976 bch_writeback_init() ||
1977 bch_debug_init(bcache_kobj))
1978 goto err;
1979
1980 return 0;
1981err:
1982 bcache_exit();
1983 return -ENOMEM;
1984}
1985
/* Module entry points: bcache_init() on load, bcache_exit() on unload. */
1986module_exit(bcache_exit);
1987module_init(bcache_init);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
new file mode 100644
index 000000000000..4d9cca47e4c6
--- /dev/null
+++ b/drivers/md/bcache/sysfs.c
@@ -0,0 +1,817 @@
1/*
2 * bcache sysfs interfaces
3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc.
6 */
7
8#include "bcache.h"
9#include "sysfs.h"
10#include "btree.h"
11#include "request.h"
12
13#include <linux/sort.h>
14
/*
 * NULL-terminated list of names for the cache_replacement_policy attribute.
 * The show handler prints it using CACHE_REPLACEMENT(&ca->sb) as the current
 * index, and the store handler writes the index parsed from user input back
 * with SET_CACHE_REPLACEMENT(), so the order of the names matters.
 */
15static const char * const cache_replacement_policies[] = {
16 "lru",
17 "fifo",
18 "random",
19 NULL
20};
21
/*
 * Attribute definitions shared by the show/store handlers in this file.
 * Each line defines a static struct attribute named sysfs_<name> (see
 * __sysfs_attribute in sysfs.h); the handlers dispatch by comparing the
 * attr pointer against these objects.
 */

/* Write-only (S_IWUSR) triggers. */
22write_attribute(attach);
23write_attribute(detach);
24write_attribute(unregister);
25write_attribute(stop);
26write_attribute(clear_stats);
27write_attribute(trigger_gc);
28write_attribute(prune_cache);
29write_attribute(flash_vol_create);
30
/* Read-only (S_IRUGO) values. */
31read_attribute(bucket_size);
32read_attribute(block_size);
33read_attribute(nbuckets);
34read_attribute(tree_depth);
35read_attribute(root_usage_percent);
36read_attribute(priority_stats);
37read_attribute(btree_cache_size);
38read_attribute(btree_cache_max_chain);
39read_attribute(cache_available_percent);
40read_attribute(written);
41read_attribute(btree_written);
42read_attribute(metadata_written);
43read_attribute(active_journal_entries);
44
/*
 * Time statistics; the macro is defined outside this file.
 * NOTE(review): the two trailing arguments look like display units - confirm.
 */
45sysfs_time_stats_attribute(btree_gc, sec, ms);
46sysfs_time_stats_attribute(btree_split, sec, us);
47sysfs_time_stats_attribute(btree_sort, ms, us);
48sysfs_time_stats_attribute(btree_read, ms, us);
49sysfs_time_stats_attribute(try_harder, ms, us);
50
51read_attribute(btree_nodes);
52read_attribute(btree_used_percent);
53read_attribute(average_key_size);
54read_attribute(dirty_data);
55read_attribute(bset_tree_stats);
56
57read_attribute(state);
58read_attribute(cache_read_races);
59read_attribute(writeback_keys_done);
60read_attribute(writeback_keys_failed);
61read_attribute(io_errors);
62read_attribute(congested);
/* Read-write (S_IRUGO|S_IWUSR) tunables from here on, except where noted. */
63rw_attribute(congested_read_threshold_us);
64rw_attribute(congested_write_threshold_us);
65
66rw_attribute(sequential_cutoff);
67rw_attribute(sequential_merge);
68rw_attribute(data_csum);
69rw_attribute(cache_mode);
70rw_attribute(writeback_metadata);
71rw_attribute(writeback_running);
72rw_attribute(writeback_percent);
73rw_attribute(writeback_delay);
74rw_attribute(writeback_rate);
75
76rw_attribute(writeback_rate_update_seconds);
77rw_attribute(writeback_rate_d_term);
78rw_attribute(writeback_rate_p_term_inverse);
79rw_attribute(writeback_rate_d_smooth);
/* Read-only. */
80read_attribute(writeback_rate_debug);
81
82rw_attribute(synchronous);
83rw_attribute(journal_delay_ms);
84rw_attribute(discard);
85rw_attribute(running);
86rw_attribute(label);
87rw_attribute(readahead);
88rw_attribute(io_error_limit);
89rw_attribute(io_error_halflife);
90rw_attribute(verify);
91rw_attribute(key_merging_disabled);
92rw_attribute(gc_always_rewrite);
93rw_attribute(freelist_percent);
94rw_attribute(cache_replacement_policy);
95rw_attribute(btree_shrinker_disabled);
96rw_attribute(copy_gc_enabled);
97rw_attribute(size);
98
99SHOW(__bch_cached_dev)
100{
101 struct cached_dev *dc = container_of(kobj, struct cached_dev,
102 disk.kobj);
103 const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
104
105#define var(stat) (dc->stat)
106
107 if (attr == &sysfs_cache_mode)
108 return bch_snprint_string_list(buf, PAGE_SIZE,
109 bch_cache_modes + 1,
110 BDEV_CACHE_MODE(&dc->sb));
111
112 sysfs_printf(data_csum, "%i", dc->disk.data_csum);
113 var_printf(verify, "%i");
114 var_printf(writeback_metadata, "%i");
115 var_printf(writeback_running, "%i");
116 var_print(writeback_delay);
117 var_print(writeback_percent);
118 sysfs_print(writeback_rate, dc->writeback_rate.rate);
119
120 var_print(writeback_rate_update_seconds);
121 var_print(writeback_rate_d_term);
122 var_print(writeback_rate_p_term_inverse);
123 var_print(writeback_rate_d_smooth);
124
125 if (attr == &sysfs_writeback_rate_debug) {
126 char dirty[20];
127 char derivative[20];
128 char target[20];
129 bch_hprint(dirty,
130 atomic_long_read(&dc->disk.sectors_dirty) << 9);
131 bch_hprint(derivative, dc->writeback_rate_derivative << 9);
132 bch_hprint(target, dc->writeback_rate_target << 9);
133
134 return sprintf(buf,
135 "rate:\t\t%u\n"
136 "change:\t\t%i\n"
137 "dirty:\t\t%s\n"
138 "derivative:\t%s\n"
139 "target:\t\t%s\n",
140 dc->writeback_rate.rate,
141 dc->writeback_rate_change,
142 dirty, derivative, target);
143 }
144
145 sysfs_hprint(dirty_data,
146 atomic_long_read(&dc->disk.sectors_dirty) << 9);
147
148 var_printf(sequential_merge, "%i");
149 var_hprint(sequential_cutoff);
150 var_hprint(readahead);
151
152 sysfs_print(running, atomic_read(&dc->running));
153 sysfs_print(state, states[BDEV_STATE(&dc->sb)]);
154
155 if (attr == &sysfs_label) {
156 memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
157 buf[SB_LABEL_SIZE + 1] = '\0';
158 strcat(buf, "\n");
159 return strlen(buf);
160 }
161
162#undef var
163 return 0;
164}
165SHOW_LOCKED(bch_cached_dev)
166
167STORE(__cached_dev)
168{
169 struct cached_dev *dc = container_of(kobj, struct cached_dev,
170 disk.kobj);
171 unsigned v = size;
172 struct cache_set *c;
173
174#define d_strtoul(var) sysfs_strtoul(var, dc->var)
175#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
176
177 sysfs_strtoul(data_csum, dc->disk.data_csum);
178 d_strtoul(verify);
179 d_strtoul(writeback_metadata);
180 d_strtoul(writeback_running);
181 d_strtoul(writeback_delay);
182 sysfs_strtoul_clamp(writeback_rate,
183 dc->writeback_rate.rate, 1, 1000000);
184 sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
185
186 d_strtoul(writeback_rate_update_seconds);
187 d_strtoul(writeback_rate_d_term);
188 d_strtoul(writeback_rate_p_term_inverse);
189 sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
190 dc->writeback_rate_p_term_inverse, 1, INT_MAX);
191 d_strtoul(writeback_rate_d_smooth);
192
193 d_strtoul(sequential_merge);
194 d_strtoi_h(sequential_cutoff);
195 d_strtoi_h(readahead);
196
197 if (attr == &sysfs_clear_stats)
198 bch_cache_accounting_clear(&dc->accounting);
199
200 if (attr == &sysfs_running &&
201 strtoul_or_return(buf))
202 bch_cached_dev_run(dc);
203
204 if (attr == &sysfs_cache_mode) {
205 ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
206
207 if (v < 0)
208 return v;
209
210 if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) {
211 SET_BDEV_CACHE_MODE(&dc->sb, v);
212 bch_write_bdev_super(dc, NULL);
213 }
214 }
215
216 if (attr == &sysfs_label) {
217 memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
218 bch_write_bdev_super(dc, NULL);
219 if (dc->disk.c) {
220 memcpy(dc->disk.c->uuids[dc->disk.id].label,
221 buf, SB_LABEL_SIZE);
222 bch_uuid_write(dc->disk.c);
223 }
224 }
225
226 if (attr == &sysfs_attach) {
227 if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16)
228 return -EINVAL;
229
230 list_for_each_entry(c, &bch_cache_sets, list) {
231 v = bch_cached_dev_attach(dc, c);
232 if (!v)
233 return size;
234 }
235
236 pr_err("Can't attach %s: cache set not found", buf);
237 size = v;
238 }
239
240 if (attr == &sysfs_detach && dc->disk.c)
241 bch_cached_dev_detach(dc);
242
243 if (attr == &sysfs_stop)
244 bcache_device_stop(&dc->disk);
245
246 return size;
247}
248
249STORE(bch_cached_dev)
250{
251 struct cached_dev *dc = container_of(kobj, struct cached_dev,
252 disk.kobj);
253
254 mutex_lock(&bch_register_lock);
255 size = __cached_dev_store(kobj, attr, buf, size);
256
257 if (attr == &sysfs_writeback_running)
258 bch_writeback_queue(dc);
259
260 if (attr == &sysfs_writeback_percent)
261 schedule_delayed_work(&dc->writeback_rate_update,
262 dc->writeback_rate_update_seconds * HZ);
263
264 mutex_unlock(&bch_register_lock);
265 return size;
266}
267
/*
 * NULL-terminated attribute list for the cached device kobject.  data_csum
 * is compiled out with #if 0 and verify is only listed when
 * CONFIG_BCACHE_DEBUG is set.  KTYPE() below defines bch_cached_dev_ktype
 * from this list plus bch_cached_dev_release/_show/_store.
 */
268static struct attribute *bch_cached_dev_files[] = {
269 &sysfs_attach,
270 &sysfs_detach,
271 &sysfs_stop,
272#if 0
273 &sysfs_data_csum,
274#endif
275 &sysfs_cache_mode,
276 &sysfs_writeback_metadata,
277 &sysfs_writeback_running,
278 &sysfs_writeback_delay,
279 &sysfs_writeback_percent,
280 &sysfs_writeback_rate,
281 &sysfs_writeback_rate_update_seconds,
282 &sysfs_writeback_rate_d_term,
283 &sysfs_writeback_rate_p_term_inverse,
284 &sysfs_writeback_rate_d_smooth,
285 &sysfs_writeback_rate_debug,
286 &sysfs_dirty_data,
287 &sysfs_sequential_cutoff,
288 &sysfs_sequential_merge,
289 &sysfs_clear_stats,
290 &sysfs_running,
291 &sysfs_state,
292 &sysfs_label,
293 &sysfs_readahead,
294#ifdef CONFIG_BCACHE_DEBUG
295 &sysfs_verify,
296#endif
297 NULL
298};
299KTYPE(bch_cached_dev);
300
301SHOW(bch_flash_dev)
302{
303 struct bcache_device *d = container_of(kobj, struct bcache_device,
304 kobj);
305 struct uuid_entry *u = &d->c->uuids[d->id];
306
307 sysfs_printf(data_csum, "%i", d->data_csum);
308 sysfs_hprint(size, u->sectors << 9);
309
310 if (attr == &sysfs_label) {
311 memcpy(buf, u->label, SB_LABEL_SIZE);
312 buf[SB_LABEL_SIZE + 1] = '\0';
313 strcat(buf, "\n");
314 return strlen(buf);
315 }
316
317 return 0;
318}
319
320STORE(__bch_flash_dev)
321{
322 struct bcache_device *d = container_of(kobj, struct bcache_device,
323 kobj);
324 struct uuid_entry *u = &d->c->uuids[d->id];
325
326 sysfs_strtoul(data_csum, d->data_csum);
327
328 if (attr == &sysfs_size) {
329 uint64_t v;
330 strtoi_h_or_return(buf, v);
331
332 u->sectors = v >> 9;
333 bch_uuid_write(d->c);
334 set_capacity(d->disk, u->sectors);
335 }
336
337 if (attr == &sysfs_label) {
338 memcpy(u->label, buf, SB_LABEL_SIZE);
339 bch_uuid_write(d->c);
340 }
341
342 if (attr == &sysfs_unregister) {
343 atomic_set(&d->detaching, 1);
344 bcache_device_stop(d);
345 }
346
347 return size;
348}
349STORE_LOCKED(bch_flash_dev)
350
/*
 * NULL-terminated attribute list for flash-only volumes; data_csum is
 * compiled out with #if 0.  KTYPE() below defines bch_flash_dev_ktype from
 * this list plus bch_flash_dev_release/_show/_store.
 */
351static struct attribute *bch_flash_dev_files[] = {
352 &sysfs_unregister,
353#if 0
354 &sysfs_data_csum,
355#endif
356 &sysfs_label,
357 &sysfs_size,
358 NULL
359};
360KTYPE(bch_flash_dev);
361
362SHOW(__bch_cache_set)
363{
364 unsigned root_usage(struct cache_set *c)
365 {
366 unsigned bytes = 0;
367 struct bkey *k;
368 struct btree *b;
369 struct btree_iter iter;
370
371 goto lock_root;
372
373 do {
374 rw_unlock(false, b);
375lock_root:
376 b = c->root;
377 rw_lock(false, b, b->level);
378 } while (b != c->root);
379
380 for_each_key_filter(b, k, &iter, bch_ptr_bad)
381 bytes += bkey_bytes(k);
382
383 rw_unlock(false, b);
384
385 return (bytes * 100) / btree_bytes(c);
386 }
387
388 size_t cache_size(struct cache_set *c)
389 {
390 size_t ret = 0;
391 struct btree *b;
392
393 mutex_lock(&c->bucket_lock);
394 list_for_each_entry(b, &c->btree_cache, list)
395 ret += 1 << (b->page_order + PAGE_SHIFT);
396
397 mutex_unlock(&c->bucket_lock);
398 return ret;
399 }
400
401 unsigned cache_max_chain(struct cache_set *c)
402 {
403 unsigned ret = 0;
404 struct hlist_head *h;
405
406 mutex_lock(&c->bucket_lock);
407
408 for (h = c->bucket_hash;
409 h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
410 h++) {
411 unsigned i = 0;
412 struct hlist_node *p;
413
414 hlist_for_each(p, h)
415 i++;
416
417 ret = max(ret, i);
418 }
419
420 mutex_unlock(&c->bucket_lock);
421 return ret;
422 }
423
424 unsigned btree_used(struct cache_set *c)
425 {
426 return div64_u64(c->gc_stats.key_bytes * 100,
427 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
428 }
429
430 unsigned average_key_size(struct cache_set *c)
431 {
432 return c->gc_stats.nkeys
433 ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
434 : 0;
435 }
436
437 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
438
439 sysfs_print(synchronous, CACHE_SYNC(&c->sb));
440 sysfs_print(journal_delay_ms, c->journal_delay_ms);
441 sysfs_hprint(bucket_size, bucket_bytes(c));
442 sysfs_hprint(block_size, block_bytes(c));
443 sysfs_print(tree_depth, c->root->level);
444 sysfs_print(root_usage_percent, root_usage(c));
445
446 sysfs_hprint(btree_cache_size, cache_size(c));
447 sysfs_print(btree_cache_max_chain, cache_max_chain(c));
448 sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);
449
450 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
451 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
452 sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us);
453 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
454 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
455
456 sysfs_print(btree_used_percent, btree_used(c));
457 sysfs_print(btree_nodes, c->gc_stats.nodes);
458 sysfs_hprint(dirty_data, c->gc_stats.dirty);
459 sysfs_hprint(average_key_size, average_key_size(c));
460
461 sysfs_print(cache_read_races,
462 atomic_long_read(&c->cache_read_races));
463
464 sysfs_print(writeback_keys_done,
465 atomic_long_read(&c->writeback_keys_done));
466 sysfs_print(writeback_keys_failed,
467 atomic_long_read(&c->writeback_keys_failed));
468
469 /* See count_io_errors for why 88 */
470 sysfs_print(io_error_halflife, c->error_decay * 88);
471 sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
472
473 sysfs_hprint(congested,
474 ((uint64_t) bch_get_congested(c)) << 9);
475 sysfs_print(congested_read_threshold_us,
476 c->congested_read_threshold_us);
477 sysfs_print(congested_write_threshold_us,
478 c->congested_write_threshold_us);
479
480 sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
481 sysfs_printf(verify, "%i", c->verify);
482 sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
483 sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
484 sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
485 sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
486
487 if (attr == &sysfs_bset_tree_stats)
488 return bch_bset_print_stats(c, buf);
489
490 return 0;
491}
492SHOW_LOCKED(bch_cache_set)
493
/*
 * Store handler for cache set attributes, including those exposed through
 * the "internal" kobject.  STORE_LOCKED() below wraps it in
 * bch_cache_set_store(), which holds bch_register_lock around the call.
 *
 * strtoul_or_return(), strtoi_h_or_return() and the sysfs_strtoul() lines
 * can return from this function themselves.  Returns size on success or a
 * negative errno.
 */
494STORE(__bch_cache_set)
495{
496 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
497
498 if (attr == &sysfs_unregister)
499 bch_cache_set_unregister(c);
500
501 if (attr == &sysfs_stop)
502 bch_cache_set_stop(c);
503
504 if (attr == &sysfs_synchronous) {
505 bool sync = strtoul_or_return(buf);
506
 /* Only rewrite the superblock when the flag really changes. */
507 if (sync != CACHE_SYNC(&c->sb)) {
508 SET_CACHE_SYNC(&c->sb, sync);
509 bcache_write_super(c);
510 }
511 }
512
513 if (attr == &sysfs_flash_vol_create) {
514 int r;
515 uint64_t v;
516 strtoi_h_or_return(buf, v);
517
518 r = bch_flash_dev_create(c, v);
519 if (r)
520 return r;
521 }
522
523 if (attr == &sysfs_clear_stats) {
524 atomic_long_set(&c->writeback_keys_done, 0);
525 atomic_long_set(&c->writeback_keys_failed, 0);
526
527 memset(&c->gc_stats, 0, sizeof(struct gc_stat));
528 bch_cache_accounting_clear(&c->accounting);
529 }
530
531 if (attr == &sysfs_trigger_gc)
532 bch_queue_gc(c);
533
 /* Invoke the set's shrinker callback directly with the requested count. */
534 if (attr == &sysfs_prune_cache) {
535 struct shrink_control sc;
536 sc.gfp_mask = GFP_KERNEL;
537 sc.nr_to_scan = strtoul_or_return(buf);
538 c->shrink.shrink(&c->shrink, &sc);
539 }
540
541 sysfs_strtoul(congested_read_threshold_us,
542 c->congested_read_threshold_us);
543 sysfs_strtoul(congested_write_threshold_us,
544 c->congested_write_threshold_us);
545
 /* Inverse of the ">> IO_ERROR_SHIFT" applied when the limit is shown. */
546 if (attr == &sysfs_io_error_limit)
547 c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
548
549 /* See count_io_errors() for why 88 */
550 if (attr == &sysfs_io_error_halflife)
551 c->error_decay = strtoul_or_return(buf) / 88;
552
553 sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
554 sysfs_strtoul(verify, c->verify);
555 sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
556 sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
557 sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
558 sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
559
560 return size;
561}
562STORE_LOCKED(bch_cache_set)
563
564SHOW(bch_cache_set_internal)
565{
566 struct cache_set *c = container_of(kobj, struct cache_set, internal);
567 return bch_cache_set_show(&c->kobj, attr, buf);
568}
569
570STORE(bch_cache_set_internal)
571{
572 struct cache_set *c = container_of(kobj, struct cache_set, internal);
573 return bch_cache_set_store(&c->kobj, attr, buf, size);
574}
575
/*
 * Release handler for the "internal" kobject; intentionally empty.  The
 * kobject is embedded in struct cache_set (see the container_of() calls in
 * the internal show/store handlers), so it has no memory of its own to free.
 */
576static void bch_cache_set_internal_release(struct kobject *k)
577{
578}
579
/*
 * NULL-terminated attribute list for the cache set's main kobject.  KTYPE()
 * below defines bch_cache_set_ktype from this list plus
 * bch_cache_set_release/_show/_store.
 */
580static struct attribute *bch_cache_set_files[] = {
581 &sysfs_unregister,
582 &sysfs_stop,
583 &sysfs_synchronous,
584 &sysfs_journal_delay_ms,
585 &sysfs_flash_vol_create,
586
587 &sysfs_bucket_size,
588 &sysfs_block_size,
589 &sysfs_tree_depth,
590 &sysfs_root_usage_percent,
591 &sysfs_btree_cache_size,
592 &sysfs_cache_available_percent,
593
594 &sysfs_average_key_size,
595 &sysfs_dirty_data,
596
597 &sysfs_io_error_limit,
598 &sysfs_io_error_halflife,
599 &sysfs_congested,
600 &sysfs_congested_read_threshold_us,
601 &sysfs_congested_write_threshold_us,
602 &sysfs_clear_stats,
603 NULL
604};
605KTYPE(bch_cache_set);
606
/*
 * NULL-terminated attribute list for the cache set's "internal" kobject.
 * verify and key_merging_disabled are only listed when CONFIG_BCACHE_DEBUG
 * is set.  The sysfs_time_stats_attribute_list() lines have no trailing
 * comma here, so the macro (defined outside this file) presumably supplies
 * the separators itself.
 */
607static struct attribute *bch_cache_set_internal_files[] = {
608 &sysfs_active_journal_entries,
609
610 sysfs_time_stats_attribute_list(btree_gc, sec, ms)
611 sysfs_time_stats_attribute_list(btree_split, sec, us)
612 sysfs_time_stats_attribute_list(btree_sort, ms, us)
613 sysfs_time_stats_attribute_list(btree_read, ms, us)
614 sysfs_time_stats_attribute_list(try_harder, ms, us)
615
616 &sysfs_btree_nodes,
617 &sysfs_btree_used_percent,
618 &sysfs_btree_cache_max_chain,
619
620 &sysfs_bset_tree_stats,
621 &sysfs_cache_read_races,
622 &sysfs_writeback_keys_done,
623 &sysfs_writeback_keys_failed,
624
625 &sysfs_trigger_gc,
626 &sysfs_prune_cache,
627#ifdef CONFIG_BCACHE_DEBUG
628 &sysfs_verify,
629 &sysfs_key_merging_disabled,
630#endif
631 &sysfs_gc_always_rewrite,
632 &sysfs_btree_shrinker_disabled,
633 &sysfs_copy_gc_enabled,
634 NULL
635};
636KTYPE(bch_cache_set_internal);
637
638SHOW(__bch_cache)
639{
640 struct cache *ca = container_of(kobj, struct cache, kobj);
641
642 sysfs_hprint(bucket_size, bucket_bytes(ca));
643 sysfs_hprint(block_size, block_bytes(ca));
644 sysfs_print(nbuckets, ca->sb.nbuckets);
645 sysfs_print(discard, ca->discard);
646 sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9);
647 sysfs_hprint(btree_written,
648 atomic_long_read(&ca->btree_sectors_written) << 9);
649 sysfs_hprint(metadata_written,
650 (atomic_long_read(&ca->meta_sectors_written) +
651 atomic_long_read(&ca->btree_sectors_written)) << 9);
652
653 sysfs_print(io_errors,
654 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
655
656 sysfs_print(freelist_percent, ca->free.size * 100 /
657 ((size_t) ca->sb.nbuckets));
658
659 if (attr == &sysfs_cache_replacement_policy)
660 return bch_snprint_string_list(buf, PAGE_SIZE,
661 cache_replacement_policies,
662 CACHE_REPLACEMENT(&ca->sb));
663
664 if (attr == &sysfs_priority_stats) {
665 int cmp(const void *l, const void *r)
666 { return *((uint16_t *) r) - *((uint16_t *) l); }
667
668 /* Number of quantiles we compute */
669 const unsigned nq = 31;
670
671 size_t n = ca->sb.nbuckets, i, unused, btree;
672 uint64_t sum = 0;
673 uint16_t q[nq], *p, *cached;
674 ssize_t ret;
675
676 cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
677 if (!p)
678 return -ENOMEM;
679
680 mutex_lock(&ca->set->bucket_lock);
681 for (i = ca->sb.first_bucket; i < n; i++)
682 p[i] = ca->buckets[i].prio;
683 mutex_unlock(&ca->set->bucket_lock);
684
685 sort(p, n, sizeof(uint16_t), cmp, NULL);
686
687 while (n &&
688 !cached[n - 1])
689 --n;
690
691 unused = ca->sb.nbuckets - n;
692
693 while (cached < p + n &&
694 *cached == BTREE_PRIO)
695 cached++;
696
697 btree = cached - p;
698 n -= btree;
699
700 for (i = 0; i < n; i++)
701 sum += INITIAL_PRIO - cached[i];
702
703 if (n)
704 do_div(sum, n);
705
706 for (i = 0; i < nq; i++)
707 q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)];
708
709 vfree(p);
710
711 ret = snprintf(buf, PAGE_SIZE,
712 "Unused: %zu%%\n"
713 "Metadata: %zu%%\n"
714 "Average: %llu\n"
715 "Sectors per Q: %zu\n"
716 "Quantiles: [",
717 unused * 100 / (size_t) ca->sb.nbuckets,
718 btree * 100 / (size_t) ca->sb.nbuckets, sum,
719 n * ca->sb.bucket_size / (nq + 1));
720
721 for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++)
722 ret += snprintf(buf + ret, PAGE_SIZE - ret,
723 i < nq - 1 ? "%u " : "%u]\n", q[i]);
724
725 buf[PAGE_SIZE - 1] = '\0';
726 return ret;
727 }
728
729 return 0;
730}
731SHOW_LOCKED(bch_cache)
732
/*
 * Store handler for the attributes of a cache device.  STORE_LOCKED() below
 * wraps it in bch_cache_store(), which holds bch_register_lock around the
 * call.
 *
 * Handles discard, cache_replacement_policy, freelist_percent and
 * clear_stats.  strtoul_or_return() can return from this function itself.
 * Returns size on success or a negative errno.
 */
733STORE(__bch_cache)
734{
735 struct cache *ca = container_of(kobj, struct cache, kobj);
736
737 if (attr == &sysfs_discard) {
738 bool v = strtoul_or_return(buf);
739
 /* The runtime flag only follows the request if the queue supports discard. */
740 if (blk_queue_discard(bdev_get_queue(ca->bdev)))
741 ca->discard = v;
742
 /* The superblock flag records the request either way. */
743 if (v != CACHE_DISCARD(&ca->sb)) {
744 SET_CACHE_DISCARD(&ca->sb, v);
745 bcache_write_super(ca->set);
746 }
747 }
748
749 if (attr == &sysfs_cache_replacement_policy) {
750 ssize_t v = bch_read_string_list(buf, cache_replacement_policies);
751
752 if (v < 0)
753 return v;
754
755 if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) {
756 mutex_lock(&ca->set->bucket_lock);
757 SET_CACHE_REPLACEMENT(&ca->sb, v);
758 mutex_unlock(&ca->set->bucket_lock);
759
760 bcache_write_super(ca->set);
761 }
762 }
763
764 if (attr == &sysfs_freelist_percent) {
765 DECLARE_FIFO(long, free);
766 long i;
767 size_t p = strtoul_or_return(buf);
768
 /*
  * Turn the percentage into a bucket count, clamped between
  * roundup_pow_of_two(nbuckets) >> 9 and nbuckets / 2.
  */
769 p = clamp_t(size_t,
770 ((size_t) ca->sb.nbuckets * p) / 100,
771 roundup_pow_of_two(ca->sb.nbuckets) >> 9,
772 ca->sb.nbuckets / 2);
773
774 if (!init_fifo_exact(&free, p, GFP_KERNEL))
775 return -ENOMEM;
776
777 mutex_lock(&ca->set->bucket_lock);
778
 /*
  * NOTE(review): presumably fifo_move() fills the new fifo from the
  * old one and fifo_swap() then exchanges them, leaving the old
  * storage (and any entries that did not fit) in the local "free" -
  * confirm against the fifo helpers in util.h.
  */
779 fifo_move(&free, &ca->free);
780 fifo_swap(&free, &ca->free);
781
782 mutex_unlock(&ca->set->bucket_lock);
783
 /* Drop the pin on every bucket still sitting in the local fifo. */
784 while (fifo_pop(&free, i))
785 atomic_dec(&ca->buckets[i].pin);
786
787 free_fifo(&free);
788 }
789
790 if (attr == &sysfs_clear_stats) {
791 atomic_long_set(&ca->sectors_written, 0);
792 atomic_long_set(&ca->btree_sectors_written, 0);
793 atomic_long_set(&ca->meta_sectors_written, 0);
794 atomic_set(&ca->io_count, 0);
795 atomic_set(&ca->io_errors, 0);
796 }
797
798 return size;
799}
800STORE_LOCKED(bch_cache)
801
/*
 * NULL-terminated attribute list for a cache device kobject.  KTYPE() below
 * defines bch_cache_ktype from this list plus
 * bch_cache_release/_show/_store.
 */
802static struct attribute *bch_cache_files[] = {
803 &sysfs_bucket_size,
804 &sysfs_block_size,
805 &sysfs_nbuckets,
806 &sysfs_priority_stats,
807 &sysfs_discard,
808 &sysfs_written,
809 &sysfs_btree_written,
810 &sysfs_metadata_written,
811 &sysfs_io_errors,
812 &sysfs_clear_stats,
813 &sysfs_freelist_percent,
814 &sysfs_cache_replacement_policy,
815 NULL
816};
817KTYPE(bch_cache);
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
new file mode 100644
index 000000000000..0526fe92a683
--- /dev/null
+++ b/drivers/md/bcache/sysfs.h
@@ -0,0 +1,110 @@
1#ifndef _BCACHE_SYSFS_H_
2#define _BCACHE_SYSFS_H_
3
/*
 * KTYPE(type): define "struct kobj_type type_ktype" whose release, show and
 * store callbacks are type_release, type_show and type_store, and whose
 * default attributes are the array type_files.
 */
4#define KTYPE(type) \
5struct kobj_type type ## _ktype = { \
6 .release = type ## _release, \
7 .sysfs_ops = &((const struct sysfs_ops) { \
8 .show = type ## _show, \
9 .store = type ## _store \
10 }), \
11 .default_attrs = type ## _files \
12}
13
/*
 * SHOW(fn): opens the definition of "static ssize_t fn_show(kobj, attr, buf)";
 * the function body follows the macro invocation.
 */
14#define SHOW(fn) \
15static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
16 char *buf) \
17
/*
 * STORE(fn): opens the definition of
 * "static ssize_t fn_store(kobj, attr, buf, size)"; the body follows.
 */
18#define STORE(fn) \
19static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
20 const char *buf, size_t size) \
21
/*
 * SHOW_LOCKED(fn): define fn_show() as a wrapper that calls __fn_show()
 * with bch_register_lock held.
 */
22#define SHOW_LOCKED(fn) \
23SHOW(fn) \
24{ \
25 ssize_t ret; \
26 mutex_lock(&bch_register_lock); \
27 ret = __ ## fn ## _show(kobj, attr, buf); \
28 mutex_unlock(&bch_register_lock); \
29 return ret; \
30}
31
/*
 * STORE_LOCKED(fn): define fn_store() as a wrapper that calls __fn_store()
 * with bch_register_lock held.
 */
32#define STORE_LOCKED(fn) \
33STORE(fn) \
34{ \
35 ssize_t ret; \
36 mutex_lock(&bch_register_lock); \
37 ret = __ ## fn ## _store(kobj, attr, buf, size); \
38 mutex_unlock(&bch_register_lock); \
39 return ret; \
40}
41
/*
 * Define a static struct attribute named sysfs_<_name> whose sysfs file name
 * is the stringified _name and whose permissions are _mode.
 */
42#define __sysfs_attribute(_name, _mode) \
43 static struct attribute sysfs_##_name = \
44 { .name = #_name, .mode = _mode }
45
/* Shorthands: owner-writable, world-readable, and both combined. */
46#define write_attribute(n) __sysfs_attribute(n, S_IWUSR)
47#define read_attribute(n) __sysfs_attribute(n, S_IRUGO)
48#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR)
49
/*
 * Output helpers for show handlers.  They expect "attr" and "buf" to be in
 * scope and RETURN from the enclosing function when attr is sysfs_<file>;
 * otherwise they do nothing.
 */

/* Print with an explicit format; a newline is appended to fmt. */
50#define sysfs_printf(file, fmt, ...) \
51do { \
52 if (attr == &sysfs_ ## file) \
53 return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \
54} while (0)
55
/* Print var through snprint(), which is defined outside this header. */
56#define sysfs_print(file, var) \
57do { \
58 if (attr == &sysfs_ ## file) \
59 return snprint(buf, PAGE_SIZE, var); \
60} while (0)
61
/*
 * Print val through bch_hprint() and append a newline; the returned length
 * is bch_hprint()'s result plus one for that newline.
 */
62#define sysfs_hprint(file, val) \
63do { \
64 if (attr == &sysfs_ ## file) { \
65 ssize_t ret = bch_hprint(buf, val); \
66 strcat(buf, "\n"); \
67 return ret + 1; \
68 } \
69} while (0)
70
/*
 * Variants that take the value from var(_var); the caller must define a
 * var() macro before using them.
 */
71#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
72#define var_print(_var) sysfs_print(_var, var(_var))
73#define var_hprint(_var) sysfs_hprint(_var, var(_var))
74
/*
 * Input helpers for store handlers.  They expect "attr", "buf" and "size"
 * to be in scope.  The sysfs_* forms RETURN from the enclosing function when
 * attr is sysfs_<file>: the parser's non-zero result if it reports one,
 * otherwise size.
 */

/* Parse buf into var with strtoul_safe(). */
75#define sysfs_strtoul(file, var) \
76do { \
77 if (attr == &sysfs_ ## file) \
78 return strtoul_safe(buf, var) ?: (ssize_t) size; \
79} while (0)
80
/* Parse buf into var with strtoul_safe_clamp(), passing min and max. */
81#define sysfs_strtoul_clamp(file, var, min, max) \
82do { \
83 if (attr == &sysfs_ ## file) \
84 return strtoul_safe_clamp(buf, var, min, max) \
85 ?: (ssize_t) size; \
86} while (0)
87
/*
 * Statement expression yielding cp parsed as a base-10 unsigned long; makes
 * the enclosing function return kstrtoul()'s error code on failure.
 */
88#define strtoul_or_return(cp) \
89({ \
90 unsigned long _v; \
91 int _r = kstrtoul(cp, 10, &_v); \
92 if (_r) \
93 return _r; \
94 _v; \
95})
96
/*
 * Parse cp into v with strtoi_h(); makes the enclosing function return the
 * error code on failure.
 */
97#define strtoi_h_or_return(cp, v) \
98do { \
99 int _r = strtoi_h(cp, &v); \
100 if (_r) \
101 return _r; \
102} while (0)
103
/* Like sysfs_strtoul(), but parses with strtoi_h(). */
104#define sysfs_hatoi(file, var) \
105do { \
106 if (attr == &sysfs_ ## file) \
107 return strtoi_h(buf, &var) ?: (ssize_t) size; \
108} while (0)
109
110#endif /* _BCACHE_SYSFS_H_ */
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
new file mode 100644
index 000000000000..983f9bb411bc
--- /dev/null
+++ b/drivers/md/bcache/trace.c
@@ -0,0 +1,26 @@
1#include "bcache.h"
2#include "btree.h"
3#include "request.h"
4
5#include <linux/module.h>
6
7#define CREATE_TRACE_POINTS
8#include <trace/events/bcache.h>
9
10EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
11EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
12EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
13EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
14EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
15EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
16EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
17EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
18EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);
19EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
20EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
21EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
22EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
23EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
24EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
25EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
26EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
new file mode 100644
index 000000000000..da3a99e85b1e
--- /dev/null
+++ b/drivers/md/bcache/util.c
@@ -0,0 +1,377 @@
1/*
2 * random utility code, for bcache but in theory not specific to bcache
3 *
4 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
5 * Copyright 2012 Google, Inc.
6 */
7
8#include <linux/bio.h>
9#include <linux/blkdev.h>
10#include <linux/ctype.h>
11#include <linux/debugfs.h>
12#include <linux/module.h>
13#include <linux/seq_file.h>
14#include <linux/types.h>
15
16#include "util.h"
17
18#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
19#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
20
/*
 * STRTO_H(name, type) - generate bch_<name>_h(cp, res).
 *
 * The generated function parses a base 10 integer with simple_<name>() and
 * then looks at the character following the digits:
 *
 *  - k, m, g, t, p, e, z, y (either case) select how many times the value is
 *    multiplied by 1024; the cases fall through so each suffix adds one more
 *    multiplication than the one below it ('y' and 'z' share one increment).
 *    A suffix with no digits in front of it is rejected.
 *  - a newline or the end of the string means no scaling.
 *  - anything else is -EINVAL.
 *
 * One trailing newline is skipped, after which the string must end.  Every
 * multiplication is range checked first and -EINVAL is returned rather than
 * overflowing.  On success the value is stored in *res and 0 is returned.
 */
#define STRTO_H(name, type)					\
int bch_ ## name ## _h(const char *cp, type *res)		\
{								\
	int shifts = 0;						\
	char *end;						\
	type val = simple_ ## name(cp, &end, 10);		\
								\
	switch (tolower(*end)) {				\
	default:						\
		return -EINVAL;					\
	case 'y':						\
	case 'z':						\
		shifts++;					\
		/* fall through */				\
	case 'e':						\
		shifts++;					\
		/* fall through */				\
	case 'p':						\
		shifts++;					\
		/* fall through */				\
	case 't':						\
		shifts++;					\
		/* fall through */				\
	case 'g':						\
		shifts++;					\
		/* fall through */				\
	case 'm':						\
		shifts++;					\
		/* fall through */				\
	case 'k':						\
		shifts++;					\
		/* a bare suffix without digits is invalid */	\
		if (end++ == cp)				\
			return -EINVAL;				\
		/* fall through */				\
	case '\n':						\
	case '\0':						\
		if (*end == '\n')				\
			end++;					\
	}							\
								\
	if (*end)						\
		return -EINVAL;					\
								\
	while (shifts--) {					\
		/* unsigned types */				\
		if ((type) ~0 > 0 &&				\
		    (type) ~0 / 1024 <= val)			\
			return -EINVAL;				\
		/* signed types, both directions */		\
		if ((val > 0 && ANYSINT_MAX(type) / 1024 < val) ||	\
		    (val < 0 && -ANYSINT_MAX(type) / 1024 > val))	\
			return -EINVAL;				\
		val *= 1024;					\
	}							\
								\
	*res = val;						\
	return 0;						\
}
70
71STRTO_H(strtoint, int)
72STRTO_H(strtouint, unsigned int)
73STRTO_H(strtoll, long long)
74STRTO_H(strtoull, unsigned long long)
75
/*
 * bch_hprint - format a signed 64 bit value in human readable form
 * @buf: destination buffer, written with sprintf() (no bounds checking)
 * @v:   value to format
 *
 * Values with a magnitude below 1024 are printed as plain integers.  Larger
 * magnitudes are repeatedly divided by 1024 and printed with a unit suffix
 * (k, M, G, ...); when fewer than three integer digits remain, one decimal
 * digit is added.
 *
 * Returns the number of characters written, excluding the trailing NUL.
 */
ssize_t bch_hprint(char *buf, int64_t v)
{
	static const char units[] = "?kMGTPEZY";
	char dec[4] = "";
	/* scale the magnitude so negative values get the right fraction */
	uint64_t q = v < 0 ? -(uint64_t) v : (uint64_t) v;
	unsigned t = 0;
	int u;

	for (u = 0; q >= 1024; u++) {
		t = q & 1023;
		q >>= 10;
	}

	if (!u)
		return sprintf(buf, "%lli", (long long) v);

	/* t is in 1/1024ths; convert it to a single decimal digit */
	if (q < 100)
		snprintf(dec, sizeof(dec), ".%u", t * 10 / 1024);

	return sprintf(buf, "%s%llu%s%c", v < 0 ? "-" : "",
		       (unsigned long long) q, dec, units[u]);
}
95
/*
 * bch_snprint_string_list - print a NULL terminated list of strings
 * @buf:      destination buffer
 * @size:     size of @buf in bytes
 * @list:     NULL terminated array of strings
 * @selected: index of the entry to wrap in square brackets
 *
 * Entries are separated by spaces and the final separator is replaced by a
 * newline.  Output is truncated to fit in @buf; an empty list yields an
 * empty string instead of writing in front of @buf.
 *
 * Returns the number of characters stored in @buf, excluding the NUL.
 */
ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
				size_t selected)
{
	char *out = buf;
	size_t i;

	if (!size)
		return 0;

	*out = '\0';

	for (i = 0; list[i]; i++) {
		size_t room = buf + size - out;
		int n = snprintf(out, room,
				 i == selected ? "[%s] " : "%s ", list[i]);

		if (n < 0)
			break;

		/* snprintf() reports the untruncated length: clamp it */
		if ((size_t) n >= room) {
			out += room - 1;
			break;
		}

		out += n;
	}

	if (out != buf)
		out[-1] = '\n';

	return out - buf;
}
109
110ssize_t bch_read_string_list(const char *buf, const char * const list[])
111{
112 size_t i;
113 char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
114 if (!d)
115 return -ENOMEM;
116
117 s = strim(d);
118
119 for (i = 0; list[i]; i++)
120 if (!strcmp(list[i], s))
121 break;
122
123 kfree(d);
124
125 if (!list[i])
126 return -EINVAL;
127
128 return i;
129}
130
/* Returns true if the first n bytes at p are all zero (or if n is 0) */
bool bch_is_zero(const char *p, size_t n)
{
	while (n--)
		if (*p++)
			return false;

	return true;
}
140
/*
 * bch_parse_uuid - parse a textual UUID into 16 raw bytes
 * @s:    input string
 * @uuid: 16 byte output buffer, zeroed before parsing
 *
 * Consumes the leading run of hex digits, '-' and ':' characters in @s.
 * Separators are skipped; hex digits are packed two per byte, high nibble
 * first, until 32 digits have been stored or the run ends.
 *
 * Returns the number of input characters consumed.
 */
int bch_parse_uuid(const char *s, char *uuid)
{
	/* the accepted prefix never changes, so measure it once */
	size_t span = strspn(s, "-0123456789:ABCDEFabcdef");
	size_t i, j;

	memset(uuid, 0, 16);

	for (i = 0, j = 0; i < span && j < 32; i++) {
		/* | 32 folds upper case hex digits onto lower case */
		unsigned x = s[i] | 32;

		if (x >= '0' && x <= '9')
			x -= '0';
		else if (x >= 'a' && x <= 'f')
			x -= 'a' - 10;
		else
			continue;	/* '-' or ':' */

		if (!(j & 1))
			x <<= 4;
		uuid[j++ >> 1] |= x;
	}

	return i;
}
168
169void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
170{
171 uint64_t now = local_clock();
172 uint64_t duration = time_after64(now, start_time)
173 ? now - start_time : 0;
174 uint64_t last = time_after64(now, stats->last)
175 ? now - stats->last : 0;
176
177 stats->max_duration = max(stats->max_duration, duration);
178
179 if (stats->last) {
180 ewma_add(stats->average_duration, duration, 8, 8);
181
182 if (stats->average_frequency)
183 ewma_add(stats->average_frequency, last, 8, 8);
184 else
185 stats->average_frequency = last << 8;
186 } else {
187 stats->average_duration = duration << 8;
188 }
189
190 stats->last = now ?: 1;
191}
192
193unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
194{
195 uint64_t now = local_clock();
196
197 d->next += div_u64(done, d->rate);
198
199 return time_after64(d->next, now)
200 ? div_u64(d->next - now, NSEC_PER_SEC / HZ)
201 : 0;
202}
203
204void bch_bio_map(struct bio *bio, void *base)
205{
206 size_t size = bio->bi_size;
207 struct bio_vec *bv = bio->bi_io_vec;
208
209 BUG_ON(!bio->bi_size);
210 BUG_ON(bio->bi_vcnt);
211
212 bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
213 goto start;
214
215 for (; size; bio->bi_vcnt++, bv++) {
216 bv->bv_offset = 0;
217start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
218 size);
219 if (base) {
220 bv->bv_page = is_vmalloc_addr(base)
221 ? vmalloc_to_page(base)
222 : virt_to_page(base);
223
224 base += bv->bv_len;
225 }
226
227 size -= bv->bv_len;
228 }
229}
230
231int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
232{
233 int i;
234 struct bio_vec *bv;
235
236 bio_for_each_segment(bv, bio, i) {
237 bv->bv_page = alloc_page(gfp);
238 if (!bv->bv_page) {
239 while (bv-- != bio->bi_io_vec + bio->bi_idx)
240 __free_page(bv->bv_page);
241 return -ENOMEM;
242 }
243 }
244
245 return 0;
246}
247
248/*
249 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
250 * use permitted, subject to terms of PostgreSQL license; see.)
251
252 * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
253 * usual sort of implementation. (See Ross Williams' excellent introduction
254 * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
255 * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
256 * If we have no working 64-bit type, then fake it with two 32-bit registers.
257 *
258 * The present implementation is a normal (not "reflected", in Williams'
259 * terms) 64-bit CRC, using initial all-ones register contents and a final
260 * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
261 * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
262 *
263 * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
264 * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
265 * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
266 * x^7 + x^4 + x + 1
267*/
268
269static const uint64_t crc_table[256] = {
270 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
271 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
272 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
273 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
274 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
275 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
276 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
277 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
278 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
279 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
280 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
281 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
282 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
283 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
284 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
285 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
286 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
287 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
288 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
289 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
290 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
291 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
292 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
293 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
294 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
295 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
296 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
297 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
298 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
299 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
300 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
301 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
302 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
303 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
304 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
305 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
306 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
307 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
308 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
309 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
310 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
311 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
312 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
313 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
314 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
315 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
316 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
317 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
318 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
319 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
320 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
321 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
322 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
323 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
324 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
325 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
326 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
327 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
328 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
329 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
330 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
331 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
332 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
333 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
334 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
335 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
336 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
337 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
338 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
339 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
340 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
341 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
342 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
343 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
344 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
345 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
346 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
347 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
348 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
349 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
350 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
351 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
352 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
353 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
354 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
355 0x9AFCE626CE85B507ULL,
356};
357
358uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len)
359{
360 const unsigned char *data = _data;
361
362 while (len--) {
363 int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
364 crc = crc_table[i] ^ (crc << 8);
365 }
366
367 return crc;
368}
369
/*
 * bch_crc64 - CRC of len bytes at data
 *
 * Starts from an all-ones register and inverts all bits of the result.
 */
uint64_t bch_crc64(const void *data, size_t len)
{
	return ~bch_crc64_update(~0ULL, data, len);
}
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
new file mode 100644
index 000000000000..577393e38c3a
--- /dev/null
+++ b/drivers/md/bcache/util.h
@@ -0,0 +1,589 @@
1
2#ifndef _BCACHE_UTIL_H
3#define _BCACHE_UTIL_H
4
5#include <linux/errno.h>
6#include <linux/kernel.h>
7#include <linux/llist.h>
8#include <linux/ratelimit.h>
9#include <linux/vmalloc.h>
10#include <linux/workqueue.h>
11
12#include "closure.h"
13
14#define PAGE_SECTORS (PAGE_SIZE / 512)
15
16struct closure;
17
18#include <trace/events/bcache.h>
19
/*
 * Checked atomic increment/decrement: with CONFIG_BCACHE_EDEBUG the result
 * is verified with BUG_ON(), otherwise these are plain atomic operations.
 */
#ifndef CONFIG_BCACHE_EDEBUG

#define atomic_dec_bug(v)	atomic_dec(v)
#define atomic_inc_bug(v, i)	atomic_inc(v)

#else /* EDEBUG */

#define atomic_dec_bug(v)	BUG_ON(atomic_dec_return(v) < 0)
#define atomic_inc_bug(v, i)	BUG_ON(atomic_inc_return(v) <= i)

#endif
31
/*
 * BITMASK(name, type, field, offset, size) - bit-field accessors
 *
 * Generates name(k), which returns the size-bit wide value stored at bit
 * offset in k->field, and SET_name(k, v), which replaces that value.  v is
 * masked to size bits before being stored so that an out of range value
 * cannot spill into the neighbouring bits of the field.
 */
#define BITMASK(name, type, field, offset, size)		\
static inline uint64_t name(const type *k)			\
{ return (k->field >> offset) & ~(~0ULL << size); }		\
								\
static inline void SET_##name(type *k, uint64_t v)		\
{								\
	k->field &= ~(~(~0ULL << size) << offset);		\
	k->field |= (v & ~(~0ULL << size)) << offset;		\
}
41
/* Declare a heap of `type` elements: capacity, fill level and storage */
#define DECLARE_HEAP(type, name)					\
	struct {							\
		size_t size;						\
		size_t used;						\
		type *data;						\
	} name

/*
 * Allocate storage for _size elements.  kmalloc() is tried when the request
 * is below KMALLOC_MAX_SIZE; if there is still no memory and gfp has bits in
 * common with GFP_KERNEL, vmalloc() is tried.  Evaluates to the data pointer
 * (NULL on failure).
 */
#define init_heap(heap, _size, gfp)					\
({									\
	size_t _nbytes;							\
									\
	(heap)->used = 0;						\
	(heap)->size = (_size);						\
	_nbytes = sizeof(*(heap)->data) * (heap)->size;			\
									\
	(heap)->data = _nbytes < KMALLOC_MAX_SIZE			\
		? kmalloc(_nbytes, (gfp))				\
		: NULL;							\
									\
	if (!(heap)->data && ((gfp) & GFP_KERNEL))			\
		(heap)->data = vmalloc(_nbytes);			\
									\
	(heap)->data;							\
})

/* Release the storage with the matching free routine and clear ->data */
#define free_heap(heap)							\
do {									\
	if (!is_vmalloc_addr((heap)->data))				\
		kfree((heap)->data);					\
	else								\
		vfree((heap)->data);					\
	(heap)->data = NULL;						\
} while (0)
70
/* Exchange heap elements i and j */
#define heap_swap(h, i, j)	swap((h)->data[i], (h)->data[j])

/*
 * Move the element at index i towards the leaves: at each level the child
 * selected by cmp is swapped with the parent, until cmp(child, parent) holds
 * or there are no children left.
 */
#define heap_sift(h, i, cmp)						\
do {									\
	size_t _pos = (i), _child;					\
									\
	while ((_child = _pos * 2 + 1) < (h)->used) {			\
		if (_child + 1 < (h)->used &&				\
		    cmp((h)->data[_child], (h)->data[_child + 1]))	\
			_child++;					\
									\
		if (cmp((h)->data[_child], (h)->data[_pos]))		\
			break;						\
									\
		heap_swap(h, _child, _pos);				\
		_pos = _child;						\
	}								\
} while (0)

/*
 * Move the element at index i towards the root until cmp(element, parent)
 * holds.  i must be an lvalue: it is updated to the element's final index.
 */
#define heap_sift_down(h, i, cmp)					\
do {									\
	while (i) {							\
		size_t _parent = ((i) - 1) / 2;				\
									\
		if (cmp((h)->data[i], (h)->data[_parent]))		\
			break;						\
									\
		heap_swap(h, i, _parent);				\
		(i) = _parent;						\
	}								\
} while (0)
99
/*
 * Append d to the heap and restore heap order around it.  Evaluates to
 * false, leaving the heap untouched, when the heap is already full.
 */
#define heap_add(h, d, cmp)						\
({									\
	bool _added = !heap_full(h);					\
									\
	if (_added) {							\
		size_t _slot = (h)->used++;				\
									\
		(h)->data[_slot] = d;					\
		heap_sift_down(h, _slot, cmp);				\
		heap_sift(h, _slot, cmp);				\
	}								\
	_added;								\
})

/*
 * Copy the root into d, move it out of the used part of the array and
 * re-sift from the root.  Evaluates to false when the heap is empty.
 */
#define heap_pop(h, d, cmp)						\
({									\
	bool _nonempty = (h)->used;					\
									\
	if (_nonempty) {						\
		(d) = (h)->data[0];					\
		(h)->used--;						\
		heap_swap(h, 0, (h)->used);				\
		heap_sift(h, 0, cmp);					\
	}								\
	_nonempty;							\
})
124
/*
 * Root element of the heap, or NULL when the heap holds no elements.  The
 * test is on ->used rather than ->size so that an allocated but empty heap
 * does not hand back a stale data[0].
 */
#define heap_peek(h)	((h)->used ? (h)->data[0] : NULL)

/* True when every allocated slot is in use */
#define heap_full(h)	((h)->used == (h)->size)
128
/*
 * Declare a ring buffer of `type` elements.  front and back are indices
 * into data and are wrapped with mask; size is the usable capacity.
 */
#define DECLARE_FIFO(type, name)					\
	struct {							\
		size_t front;						\
		size_t back;						\
		size_t size;						\
		size_t mask;						\
		type *data;						\
	} name

/* Walk the fifo from front to back, loading each element into c */
#define fifo_for_each(c, fifo, iter)					\
	for (iter = (fifo)->front;					\
	     c = (fifo)->data[iter], iter != (fifo)->back;		\
	     iter = (iter + 1) & (fifo)->mask)

/*
 * Allocate storage for a fifo whose ->size is already set: the slot count
 * is ->size + 1 rounded up to a power of two.  kmalloc() is tried below
 * KMALLOC_MAX_SIZE, then vmalloc() if gfp has bits in common with
 * GFP_KERNEL.  Evaluates to the data pointer (NULL on failure).
 */
#define __init_fifo(fifo, gfp)						\
({									\
	size_t _slots, _nbytes;						\
									\
	BUG_ON(!(fifo)->size);						\
									\
	_slots = roundup_pow_of_two((fifo)->size + 1);			\
	_nbytes = _slots * sizeof(*(fifo)->data);			\
									\
	(fifo)->mask = _slots - 1;					\
	(fifo)->front = 0;						\
	(fifo)->back = 0;						\
									\
	(fifo)->data = _nbytes < KMALLOC_MAX_SIZE			\
		? kmalloc(_nbytes, (gfp))				\
		: NULL;							\
									\
	if (!(fifo)->data && ((gfp) & GFP_KERNEL))			\
		(fifo)->data = vmalloc(_nbytes);			\
									\
	(fifo)->data;							\
})

/* Initialise a fifo with a capacity of exactly _size elements */
#define init_fifo_exact(fifo, _size, gfp)				\
({									\
	(fifo)->size = (_size);						\
	__init_fifo(fifo, gfp);						\
})

/*
 * Initialise a fifo with a capacity of at least _size elements; sizes
 * above 4 are rounded up to one less than a power of two.
 */
#define init_fifo(fifo, _size, gfp)					\
({									\
	(fifo)->size = (_size);						\
	if ((fifo)->size > 4)						\
		(fifo)->size = roundup_pow_of_two((fifo)->size) - 1;	\
	__init_fifo(fifo, gfp);						\
})

/* Release the storage with the matching free routine and clear ->data */
#define free_fifo(fifo)							\
do {									\
	if (!is_vmalloc_addr((fifo)->data))				\
		kfree((fifo)->data);					\
	else								\
		vfree((fifo)->data);					\
	(fifo)->data = NULL;						\
} while (0)
181
/* Number of elements currently stored / number that can still be added */
#define fifo_used(fifo)		(((fifo)->back - (fifo)->front) & (fifo)->mask)
#define fifo_free(fifo)		((fifo)->size - fifo_used(fifo))

#define fifo_empty(fifo)	(!fifo_used(fifo))
#define fifo_full(fifo)		(!fifo_free(fifo))

/* First and last stored element (only meaningful on a non-empty fifo) */
#define fifo_front(fifo)	((fifo)->data[(fifo)->front])
#define fifo_back(fifo)							\
	((fifo)->data[((fifo)->back - 1) & (fifo)->mask])

/* Position of element pointer p relative to the front element */
#define fifo_idx(fifo, p)	(((p) - &fifo_front(fifo)) & (fifo)->mask)

/* Append i at the back; evaluates to false if the fifo is full */
#define fifo_push_back(fifo, i)						\
({									\
	bool _ok = !fifo_full((fifo));					\
									\
	if (_ok) {							\
		(fifo)->data[(fifo)->back] = (i);			\
		(fifo)->back = ((fifo)->back + 1) & (fifo)->mask;	\
	}								\
	_ok;								\
})

/* Remove the front element into i; evaluates to false if empty */
#define fifo_pop_front(fifo, i)						\
({									\
	bool _ok = !fifo_empty((fifo));					\
									\
	if (_ok) {							\
		(i) = (fifo)->data[(fifo)->front];			\
		(fifo)->front = ((fifo)->front + 1) & (fifo)->mask;	\
	}								\
	_ok;								\
})

/* Insert i in front of the current front; false if the fifo is full */
#define fifo_push_front(fifo, i)					\
({									\
	bool _ok = !fifo_full((fifo));					\
									\
	if (_ok) {							\
		(fifo)->front = ((fifo)->front - 1) & (fifo)->mask;	\
		(fifo)->data[(fifo)->front] = (i);			\
	}								\
	_ok;								\
})
224
/*
 * Remove the element at the back of the fifo into i.  Evaluates to false,
 * leaving i untouched, when the fifo is empty.
 */
#define fifo_pop_back(fifo, i)						\
({									\
	bool _r = !fifo_empty((fifo));					\
	if (_r) {							\
		--(fifo)->back;						\
		(fifo)->back &= (fifo)->mask;				\
		(i) = (fifo)->data[(fifo)->back];			\
	}								\
	_r;								\
})
235
/* Default queue discipline: push at the back, pop from the front */
#define fifo_push(fifo, i)	fifo_push_back(fifo, (i))
#define fifo_pop(fifo, i)	fifo_pop_front(fifo, (i))

/* Exchange the complete state (indices, capacity and storage) of l and r */
#define fifo_swap(l, r)							\
do {									\
	swap((l)->data, (r)->data);					\
	swap((l)->mask, (r)->mask);					\
	swap((l)->size, (r)->size);					\
	swap((l)->back, (r)->back);					\
	swap((l)->front, (r)->front);					\
} while (0)

/* Move elements from src to dest until dest is full or src is empty */
#define fifo_move(dest, src)						\
do {									\
	typeof(*((dest)->data)) _elem;					\
									\
	for (;;) {							\
		if (fifo_full(dest))					\
			break;						\
		if (!fifo_pop(src, _elem))				\
			break;						\
		fifo_push(dest, _elem);					\
	}								\
} while (0)
255
256/*
257 * Simple array based allocator - preallocates a number of elements and you can
258 * never allocate more than that, also has no locking.
259 *
260 * Handy because if you know you only need a fixed number of elements you don't
261 * have to worry about memory allocation failure, and sometimes a mempool isn't
262 * what you want.
263 *
264 * We treat the free elements as entries in a singly linked list, and the
265 * freelist as a stack - allocating and freeing push and pop off the freelist.
266 */
267
/* Storage for `size` elements of `type` plus the head of the free list */
#define DECLARE_ARRAY_ALLOCATOR(type, name, size)			\
	struct {							\
		type	*freelist;					\
		type	data[size];					\
	} name

/*
 * Pop the head of the free list.  The pointer to the next free element is
 * kept in the first bytes of the free element itself.  Evaluates to NULL
 * when no element is free.
 */
#define array_alloc(array)						\
({									\
	typeof((array)->freelist) _head = (array)->freelist;		\
									\
	if (_head)							\
		(array)->freelist =					\
			*((typeof((array)->freelist) *) _head);		\
									\
	_head;								\
})

/* Push ptr back onto the free list */
#define array_free(array, ptr)						\
do {									\
	typeof((array)->freelist) _elem = ptr;				\
									\
	*((typeof((array)->freelist) *) _elem) = (array)->freelist;	\
	(array)->freelist = _elem;					\
} while (0)

/*
 * Put every element of data[] on the free list, in index order.  Elements
 * must be at least pointer sized because the list link lives inside them.
 */
#define array_allocator_init(array)					\
do {									\
	typeof((array)->freelist) _cur = (array)->data;			\
									\
	BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *));	\
	(array)->freelist = NULL;					\
									\
	while (_cur < (array)->data + ARRAY_SIZE((array)->data)) {	\
		array_free(array, _cur);				\
		_cur++;							\
	}								\
} while (0)

#define array_freelist_empty(array)	((array)->freelist == NULL)
306
307#define ANYSINT_MAX(t) \
308 ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
309
310int bch_strtoint_h(const char *, int *);
311int bch_strtouint_h(const char *, unsigned int *);
312int bch_strtoll_h(const char *, long long *);
313int bch_strtoull_h(const char *, unsigned long long *);
314
/*
 * long / unsigned long front ends: forward to the parser for the fixed
 * width type that has the same size as long on this build.
 */
static inline int bch_strtol_h(const char *cp, long *res)
{
#if BITS_PER_LONG != 32
	return bch_strtoll_h(cp, (long long *) res);
#else
	return bch_strtoint_h(cp, (int *) res);
#endif
}

static inline int bch_strtoul_h(const char *cp, long *res)
{
#if BITS_PER_LONG != 32
	return bch_strtoull_h(cp, (unsigned long long *) res);
#else
	return bch_strtouint_h(cp, (unsigned int *) res);
#endif
}
332
/*
 * Type generic front end for the bch_strto*_h() parsers: the parser is
 * selected at compile time from the type res points to.  Evaluates to
 * -EINVAL if *res is not one of the six supported integer types.
 */
#define strtoi_h(cp, res)						\
	(__builtin_types_compatible_p(typeof(*res), int)		\
	 ? bch_strtoint_h(cp, (void *) res)				\
	 : __builtin_types_compatible_p(typeof(*res), unsigned int)	\
	 ? bch_strtouint_h(cp, (void *) res)				\
	 : __builtin_types_compatible_p(typeof(*res), long)		\
	 ? bch_strtol_h(cp, (void *) res)				\
	 : __builtin_types_compatible_p(typeof(*res), unsigned long)	\
	 ? bch_strtoul_h(cp, (void *) res)				\
	 : __builtin_types_compatible_p(typeof(*res), long long)	\
	 ? bch_strtoll_h(cp, (void *) res)				\
	 : __builtin_types_compatible_p(typeof(*res), unsigned long long)\
	 ? bch_strtoull_h(cp, (void *) res) : -EINVAL)
346
/*
 * Parse cp as a base 10 unsigned long.  var is assigned only when parsing
 * succeeds; the expression evaluates to the kstrtoul() return code.
 */
#define strtoul_safe(cp, var)						\
({									\
	unsigned long _parsed;						\
	int _err = kstrtoul(cp, 10, &_parsed);				\
									\
	if (_err == 0)							\
		var = _parsed;						\
	_err;								\
})

/* As strtoul_safe(), with the parsed value clamped to [min, max] */
#define strtoul_safe_clamp(cp, var, min, max)				\
({									\
	unsigned long _parsed;						\
	int _err = kstrtoul(cp, 10, &_parsed);				\
									\
	if (_err == 0)							\
		var = clamp_t(typeof(var), _parsed, min, max);		\
	_err;								\
})
364
/*
 * Pick the printf format (including a trailing newline) that matches the
 * type of var; unrecognised types fall back to "%i\n".
 */
#define __snprint_fmt(var)						\
	(__builtin_types_compatible_p(typeof(var), int)			\
	 ? "%i\n"							\
	 : __builtin_types_compatible_p(typeof(var), unsigned)		\
	 ? "%u\n"							\
	 : __builtin_types_compatible_p(typeof(var), long)		\
	 ? "%li\n"							\
	 : __builtin_types_compatible_p(typeof(var), unsigned long)	\
	 ? "%lu\n"							\
	 : __builtin_types_compatible_p(typeof(var), int64_t)		\
	 ? "%lli\n"							\
	 : __builtin_types_compatible_p(typeof(var), uint64_t)		\
	 ? "%llu\n"							\
	 : __builtin_types_compatible_p(typeof(var), const char *)	\
	 ? "%s\n" : "%i\n")

/* snprintf() a single value followed by a newline */
#define snprint(buf, size, var)						\
	snprintf(buf, size, __snprint_fmt(var), var)
381
382ssize_t bch_hprint(char *buf, int64_t v);
383
384bool bch_is_zero(const char *p, size_t n);
385int bch_parse_uuid(const char *s, char *uuid);
386
387ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
388 size_t selected);
389
390ssize_t bch_read_string_list(const char *buf, const char * const list[]);
391
/*
 * Timing statistics for a recurring event.  Every field is in nanoseconds;
 * the two averages are ewmas that are stored shifted left by 8.
 */
struct time_stats {
	uint64_t	max_duration;		/* longest duration seen */
	uint64_t	average_duration;	/* ewma, << 8 */
	uint64_t	average_frequency;	/* ewma of gap between events, << 8 */
	uint64_t	last;			/* clock at last update, 0 if none */
};
402
403void bch_time_stats_update(struct time_stats *stats, uint64_t time);
404
/* Nanoseconds per unit, keyed by the unit suffix used in attribute names */
#define NSEC_PER_ns			1L
#define NSEC_PER_us			NSEC_PER_USEC
#define NSEC_PER_ms			NSEC_PER_MSEC
#define NSEC_PER_sec			NSEC_PER_SEC

/*
 * Print one field of a struct time_stats through the attribute
 * <name>_<stat>_<units>: the stored value is shifted right by 8 and then
 * divided down to the requested unit.
 */
#define __print_time_stat(stats, name, stat, units)			\
	sysfs_print(name ## _ ## stat ## _ ## units,			\
		    div_u64((stats)->stat >> 8, NSEC_PER_ ## units))

/*
 * Print whichever of the four time stats attributes is being read.  The
 * <name>_last_<units> attribute reports the time since the last update, or
 * -1 if ->last is 0.
 */
#define sysfs_print_time_stats(stats, name,				\
			       frequency_units,				\
			       duration_units)				\
do {									\
	__print_time_stat(stats, name,					\
			  average_frequency,	frequency_units);	\
	__print_time_stat(stats, name,					\
			  average_duration,	duration_units);	\
	__print_time_stat(stats, name,					\
			  max_duration,		duration_units);	\
									\
	sysfs_print(name ## _last_ ## frequency_units, (stats)->last	\
		    ? div_s64(local_clock() - (stats)->last,		\
			      NSEC_PER_ ## frequency_units)		\
		    : -1LL);						\
} while (0)

/* Declare the four read-only attributes printed by the macro above */
#define sysfs_time_stats_attribute(name,				\
				   frequency_units,			\
				   duration_units)			\
read_attribute(name ## _average_frequency_ ## frequency_units);		\
read_attribute(name ## _average_duration_ ## duration_units);		\
read_attribute(name ## _max_duration_ ## duration_units);		\
read_attribute(name ## _last_ ## frequency_units)

/* The same four attributes as entries for an attribute pointer array */
#define sysfs_time_stats_attribute_list(name,				\
					frequency_units,		\
					duration_units)			\
&sysfs_ ## name ## _average_frequency_ ## frequency_units,		\
&sysfs_ ## name ## _average_duration_ ## duration_units,		\
&sysfs_ ## name ## _max_duration_ ## duration_units,			\
&sysfs_ ## name ## _last_ ## frequency_units,
446
/*
 * Fold val into an exponentially weighted moving average, in place.  ewma
 * is stored shifted left by factor; the new sample gets a weight of
 * 1/weight.  Evaluates to the updated average shifted back down by factor.
 */
#define ewma_add(ewma, val, weight, factor)				\
({									\
	(ewma) *= (weight) - 1;						\
	(ewma) += (val) << (factor);					\
	(ewma) /= (weight);						\
									\
	(ewma) >> (factor);						\
})
454
455struct ratelimit {
456 uint64_t next;
457 unsigned rate;
458};
459
460static inline void ratelimit_reset(struct ratelimit *d)
461{
462 d->next = local_clock();
463}
464
465unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
466
/*
 * n / d, except that a zero divisor yields `zero` instead of faulting.
 * n and d are each evaluated exactly once.
 */
#define __DIV_SAFE(n, d, zero)						\
({									\
	typeof(n) _num = (n);						\
	typeof(d) _den = (d);						\
									\
	_den ? _num / _den : zero;					\
})

#define DIV_SAFE(n, d)	__DIV_SAFE(n, d, 0)

/* container_of() that passes a NULL pointer through unchanged */
#define container_of_or_null(ptr, type, member)				\
({									\
	typeof(ptr) _p = ptr;						\
									\
	_p ? container_of(_p, type, member) : NULL;			\
})
481
/*
 * Insert `new` into the rbtree at root, ordered by cmp(new, existing).
 * Evaluates to 0 on success, or to -1 without modifying the tree when an
 * element comparing equal is already present.
 */
#define RB_INSERT(root, new, member, cmp)				\
({									\
	struct rb_node **_link = &(root)->rb_node, *_parent = NULL;	\
	typeof(new) _this;						\
	int _order, _ret = -1;						\
									\
	while (*_link) {						\
		_parent = *_link;					\
		_this = container_of(*_link, typeof(*(new)), member);	\
		_order = cmp(new, _this);				\
		if (!_order)						\
			break;		/* duplicate */			\
		_link = _order < 0					\
			? &(*_link)->rb_left				\
			: &(*_link)->rb_right;				\
	}								\
									\
	/* an empty slot was reached: no duplicate, link the node */	\
	if (!*_link) {							\
		rb_link_node(&(new)->member, _parent, _link);		\
		rb_insert_color(&(new)->member, root);			\
		_ret = 0;						\
	}								\
	_ret;								\
})

/*
 * Find the element for which cmp(&search, element) is 0.  Evaluates to a
 * pointer to it, or NULL if there is none.
 */
#define RB_SEARCH(root, search, member, cmp)				\
({									\
	struct rb_node *_node = (root)->rb_node;			\
	typeof(&(search)) _this, _found = NULL;				\
	int _order;							\
									\
	while (_node) {							\
		_this = container_of(_node, typeof(search), member);	\
		_order = cmp(&(search), _this);				\
		if (!_order) {						\
			_found = _this;					\
			break;						\
		}							\
		if (_order < 0)						\
			_node = _node->rb_left;				\
		else							\
			_node = _node->rb_right;			\
	}								\
	_found;								\
})

/*
 * Find the leftmost element for which cmp(&search, element) is negative.
 * Evaluates to a pointer to it, or NULL if there is none.
 */
#define RB_GREATER(root, search, member, cmp)				\
({									\
	struct rb_node *_node = (root)->rb_node;			\
	typeof(&(search)) _this, _best = NULL;				\
	int _order;							\
									\
	while (_node) {							\
		_this = container_of(_node, typeof(search), member);	\
		_order = cmp(&(search), _this);				\
		if (_order >= 0) {					\
			_node = _node->rb_right;			\
		} else {						\
			_best = _this;					\
			_node = _node->rb_left;				\
		}							\
	}								\
	_best;								\
})

/* First/last element of a tree and neighbours of an element, or NULL */
#define RB_FIRST(root, type, member)					\
	container_of_or_null(rb_first(root), type, member)

#define RB_LAST(root, type, member)					\
	container_of_or_null(rb_last(root), type, member)

#define RB_NEXT(ptr, member)						\
	container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)

#define RB_PREV(ptr, member)						\
	container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
556
/*
 * Does linear interpolation between powers of two.
 *
 * x is a fixed point number whose low fract_bits bits are the fraction.
 * The result is 2^(integer part), plus that power of two scaled by the
 * fraction.  All shifts are done on unsigned operands so that no negative
 * value is shifted and an integer part of 31 stays well defined.
 */
static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
{
	unsigned fract = x & ~(~0U << fract_bits);

	x >>= fract_bits;
	x = 1U << x;
	x += (x * fract) >> fract_bits;

	return x;
}
568
569#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
570
571void bch_bio_map(struct bio *bio, void *base);
572
573int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
574
575static inline sector_t bdev_sectors(struct block_device *bdev)
576{
577 return bdev->bd_inode->i_size >> 9;
578}
579
580#define closure_bio_submit(bio, cl, dev) \
581do { \
582 closure_get(cl); \
583 bch_generic_make_request(bio, &(dev)->bio_split_hook); \
584} while (0)
585
586uint64_t bch_crc64_update(uint64_t, const void *, size_t);
587uint64_t bch_crc64(const void *, size_t);
588
589#endif /* _BCACHE_UTIL_H */
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
new file mode 100644
index 000000000000..93e7e31a4bd3
--- /dev/null
+++ b/drivers/md/bcache/writeback.c
@@ -0,0 +1,414 @@
1/*
2 * background writeback - scan btree for dirty data and write it to the backing
3 * device
4 *
5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
6 * Copyright 2012 Google, Inc.
7 */
8
9#include "bcache.h"
10#include "btree.h"
11#include "debug.h"
12
13static struct workqueue_struct *dirty_wq;
14
15static void read_dirty(struct closure *);
16
17struct dirty_io {
18 struct closure cl;
19 struct cached_dev *dc;
20 struct bio bio;
21};
22
23/* Rate limiting */
24
25static void __update_writeback_rate(struct cached_dev *dc)
26{
27 struct cache_set *c = dc->disk.c;
28 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
29 uint64_t cache_dirty_target =
30 div_u64(cache_sectors * dc->writeback_percent, 100);
31
32 int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
33 c->cached_dev_sectors);
34
35 /* PD controller */
36
37 int change = 0;
38 int64_t error;
39 int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
40 int64_t derivative = dirty - dc->disk.sectors_dirty_last;
41
42 dc->disk.sectors_dirty_last = dirty;
43
44 derivative *= dc->writeback_rate_d_term;
45 derivative = clamp(derivative, -dirty, dirty);
46
47 derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
48 dc->writeback_rate_d_smooth, 0);
49
50 /* Avoid divide by zero */
51 if (!target)
52 goto out;
53
54 error = div64_s64((dirty + derivative - target) << 8, target);
55
56 change = div_s64((dc->writeback_rate.rate * error) >> 8,
57 dc->writeback_rate_p_term_inverse);
58
59 /* Don't increase writeback rate if the device isn't keeping up */
60 if (change > 0 &&
61 time_after64(local_clock(),
62 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
63 change = 0;
64
65 dc->writeback_rate.rate =
66 clamp_t(int64_t, dc->writeback_rate.rate + change,
67 1, NSEC_PER_MSEC);
68out:
69 dc->writeback_rate_derivative = derivative;
70 dc->writeback_rate_change = change;
71 dc->writeback_rate_target = target;
72
73 schedule_delayed_work(&dc->writeback_rate_update,
74 dc->writeback_rate_update_seconds * HZ);
75}
76
77static void update_writeback_rate(struct work_struct *work)
78{
79 struct cached_dev *dc = container_of(to_delayed_work(work),
80 struct cached_dev,
81 writeback_rate_update);
82
83 down_read(&dc->writeback_lock);
84
85 if (atomic_read(&dc->has_dirty) &&
86 dc->writeback_percent)
87 __update_writeback_rate(dc);
88
89 up_read(&dc->writeback_lock);
90}
91
92static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
93{
94 if (atomic_read(&dc->disk.detaching) ||
95 !dc->writeback_percent)
96 return 0;
97
98 return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
99}
100
101/* Background writeback */
102
103static bool dirty_pred(struct keybuf *buf, struct bkey *k)
104{
105 return KEY_DIRTY(k);
106}
107
108static void dirty_init(struct keybuf_key *w)
109{
110 struct dirty_io *io = w->private;
111 struct bio *bio = &io->bio;
112
113 bio_init(bio);
114 if (!io->dc->writeback_percent)
115 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
116
117 bio->bi_size = KEY_SIZE(&w->key) << 9;
118 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
119 bio->bi_private = w;
120 bio->bi_io_vec = bio->bi_inline_vecs;
121 bch_bio_map(bio, NULL);
122}
123
124static void refill_dirty(struct closure *cl)
125{
126 struct cached_dev *dc = container_of(cl, struct cached_dev,
127 writeback.cl);
128 struct keybuf *buf = &dc->writeback_keys;
129 bool searched_from_start = false;
130 struct bkey end = MAX_KEY;
131 SET_KEY_INODE(&end, dc->disk.id);
132
133 if (!atomic_read(&dc->disk.detaching) &&
134 !dc->writeback_running)
135 closure_return(cl);
136
137 down_write(&dc->writeback_lock);
138
139 if (!atomic_read(&dc->has_dirty)) {
140 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
141 bch_write_bdev_super(dc, NULL);
142
143 up_write(&dc->writeback_lock);
144 closure_return(cl);
145 }
146
147 if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
148 buf->last_scanned = KEY(dc->disk.id, 0, 0);
149 searched_from_start = true;
150 }
151
152 bch_refill_keybuf(dc->disk.c, buf, &end);
153
154 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
155 /* Searched the entire btree - delay awhile */
156
157 if (RB_EMPTY_ROOT(&buf->keys)) {
158 atomic_set(&dc->has_dirty, 0);
159 cached_dev_put(dc);
160 }
161
162 if (!atomic_read(&dc->disk.detaching))
163 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
164 }
165
166 up_write(&dc->writeback_lock);
167
168 ratelimit_reset(&dc->writeback_rate);
169
170 /* Punt to workqueue only so we don't recurse and blow the stack */
171 continue_at(cl, read_dirty, dirty_wq);
172}
173
174void bch_writeback_queue(struct cached_dev *dc)
175{
176 if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
177 if (!atomic_read(&dc->disk.detaching))
178 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
179
180 continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
181 }
182}
183
184void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
185{
186 atomic_long_add(sectors, &dc->disk.sectors_dirty);
187
188 if (!atomic_read(&dc->has_dirty) &&
189 !atomic_xchg(&dc->has_dirty, 1)) {
190 atomic_inc(&dc->count);
191
192 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
193 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
194 /* XXX: should do this synchronously */
195 bch_write_bdev_super(dc, NULL);
196 }
197
198 bch_writeback_queue(dc);
199
200 if (dc->writeback_percent)
201 schedule_delayed_work(&dc->writeback_rate_update,
202 dc->writeback_rate_update_seconds * HZ);
203 }
204}
205
206/* Background writeback - IO loop */
207
208static void dirty_io_destructor(struct closure *cl)
209{
210 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
211 kfree(io);
212}
213
214static void write_dirty_finish(struct closure *cl)
215{
216 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
217 struct keybuf_key *w = io->bio.bi_private;
218 struct cached_dev *dc = io->dc;
219 struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
220
221 while (bv-- != io->bio.bi_io_vec)
222 __free_page(bv->bv_page);
223
224 /* This is kind of a dumb way of signalling errors. */
225 if (KEY_DIRTY(&w->key)) {
226 unsigned i;
227 struct btree_op op;
228 bch_btree_op_init_stack(&op);
229
230 op.type = BTREE_REPLACE;
231 bkey_copy(&op.replace, &w->key);
232
233 SET_KEY_DIRTY(&w->key, false);
234 bch_keylist_add(&op.keys, &w->key);
235
236 for (i = 0; i < KEY_PTRS(&w->key); i++)
237 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
238
239 pr_debug("clearing %s", pkey(&w->key));
240 bch_btree_insert(&op, dc->disk.c);
241 closure_sync(&op.cl);
242
243 atomic_long_inc(op.insert_collision
244 ? &dc->disk.c->writeback_keys_failed
245 : &dc->disk.c->writeback_keys_done);
246 }
247
248 bch_keybuf_del(&dc->writeback_keys, w);
249 atomic_dec_bug(&dc->in_flight);
250
251 closure_wake_up(&dc->writeback_wait);
252
253 closure_return_with_destructor(cl, dirty_io_destructor);
254}
255
256static void dirty_endio(struct bio *bio, int error)
257{
258 struct keybuf_key *w = bio->bi_private;
259 struct dirty_io *io = w->private;
260
261 if (error)
262 SET_KEY_DIRTY(&w->key, false);
263
264 closure_put(&io->cl);
265}
266
267static void write_dirty(struct closure *cl)
268{
269 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
270 struct keybuf_key *w = io->bio.bi_private;
271
272 dirty_init(w);
273 io->bio.bi_rw = WRITE;
274 io->bio.bi_sector = KEY_START(&w->key);
275 io->bio.bi_bdev = io->dc->bdev;
276 io->bio.bi_end_io = dirty_endio;
277
278 trace_bcache_write_dirty(&io->bio);
279 closure_bio_submit(&io->bio, cl, &io->dc->disk);
280
281 continue_at(cl, write_dirty_finish, dirty_wq);
282}
283
284static void read_dirty_endio(struct bio *bio, int error)
285{
286 struct keybuf_key *w = bio->bi_private;
287 struct dirty_io *io = w->private;
288
289 bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
290 error, "reading dirty data from cache");
291
292 dirty_endio(bio, error);
293}
294
295static void read_dirty_submit(struct closure *cl)
296{
297 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
298
299 trace_bcache_read_dirty(&io->bio);
300 closure_bio_submit(&io->bio, cl, &io->dc->disk);
301
302 continue_at(cl, write_dirty, dirty_wq);
303}
304
305static void read_dirty(struct closure *cl)
306{
307 struct cached_dev *dc = container_of(cl, struct cached_dev,
308 writeback.cl);
309 unsigned delay = writeback_delay(dc, 0);
310 struct keybuf_key *w;
311 struct dirty_io *io;
312
313 /*
314 * XXX: if we error, background writeback just spins. Should use some
315 * mempools.
316 */
317
318 while (1) {
319 w = bch_keybuf_next(&dc->writeback_keys);
320 if (!w)
321 break;
322
323 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
324
325 if (delay > 0 &&
326 (KEY_START(&w->key) != dc->last_read ||
327 jiffies_to_msecs(delay) > 50)) {
328 w->private = NULL;
329
330 closure_delay(&dc->writeback, delay);
331 continue_at(cl, read_dirty, dirty_wq);
332 }
333
334 dc->last_read = KEY_OFFSET(&w->key);
335
336 io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
337 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
338 GFP_KERNEL);
339 if (!io)
340 goto err;
341
342 w->private = io;
343 io->dc = dc;
344
345 dirty_init(w);
346 io->bio.bi_sector = PTR_OFFSET(&w->key, 0);
347 io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
348 &w->key, 0)->bdev;
349 io->bio.bi_rw = READ;
350 io->bio.bi_end_io = read_dirty_endio;
351
352 if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
353 goto err_free;
354
355 pr_debug("%s", pkey(&w->key));
356
357 closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
358
359 delay = writeback_delay(dc, KEY_SIZE(&w->key));
360
361 atomic_inc(&dc->in_flight);
362
363 if (!closure_wait_event(&dc->writeback_wait, cl,
364 atomic_read(&dc->in_flight) < 64))
365 continue_at(cl, read_dirty, dirty_wq);
366 }
367
368 if (0) {
369err_free:
370 kfree(w->private);
371err:
372 bch_keybuf_del(&dc->writeback_keys, w);
373 }
374
375 refill_dirty(cl);
376}
377
378void bch_writeback_init_cached_dev(struct cached_dev *dc)
379{
380 closure_init_unlocked(&dc->writeback);
381 init_rwsem(&dc->writeback_lock);
382
383 bch_keybuf_init(&dc->writeback_keys, dirty_pred);
384
385 dc->writeback_metadata = true;
386 dc->writeback_running = true;
387 dc->writeback_percent = 10;
388 dc->writeback_delay = 30;
389 dc->writeback_rate.rate = 1024;
390
391 dc->writeback_rate_update_seconds = 30;
392 dc->writeback_rate_d_term = 16;
393 dc->writeback_rate_p_term_inverse = 64;
394 dc->writeback_rate_d_smooth = 8;
395
396 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
397 schedule_delayed_work(&dc->writeback_rate_update,
398 dc->writeback_rate_update_seconds * HZ);
399}
400
401void bch_writeback_exit(void)
402{
403 if (dirty_wq)
404 destroy_workqueue(dirty_wq);
405}
406
407int __init bch_writeback_init(void)
408{
409 dirty_wq = create_singlethread_workqueue("bcache_writeback");
410 if (!dirty_wq)
411 return -ENOMEM;
412
413 return 0;
414}
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index f204a7a9cf38..6e7ec64b69ab 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -78,3 +78,9 @@ SUBSYS(hugetlb)
78#endif 78#endif
79 79
80/* */ 80/* */
81
82#ifdef CONFIG_CGROUP_BCACHE
83SUBSYS(bcache)
84#endif
85
86/* */
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 0c5a18ec322c..1b4d4ee1168f 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -52,7 +52,7 @@
52#endif 52#endif
53 53
54extern const char *drbd_buildtag(void); 54extern const char *drbd_buildtag(void);
55#define REL_VERSION "8.4.2" 55#define REL_VERSION "8.4.3"
56#define API_VERSION 1 56#define API_VERSION 1
57#define PRO_VERSION_MIN 86 57#define PRO_VERSION_MIN 86
58#define PRO_VERSION_MAX 101 58#define PRO_VERSION_MAX 101
@@ -319,7 +319,8 @@ enum drbd_state_rv {
319 SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ 319 SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
320 SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ 320 SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
321 SS_O_VOL_PEER_PRI = -20, 321 SS_O_VOL_PEER_PRI = -20,
322 SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ 322 SS_OUTDATE_WO_CONN = -21,
323 SS_AFTER_LAST_ERROR = -22, /* Keep this at bottom */
323}; 324};
324 325
325/* from drbd_strings.c */ 326/* from drbd_strings.c */
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 1fa19c5f5e64..1fedf2b17cc8 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -126,13 +126,12 @@
126#define DRBD_RESYNC_RATE_DEF 250 126#define DRBD_RESYNC_RATE_DEF 250
127#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ 127#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */
128 128
129 /* less than 7 would hit performance unnecessarily. 129 /* less than 7 would hit performance unnecessarily. */
130 * 919 slots context information per transaction,
131 * 32k activity log, 4k transaction size,
132 * one transaction in flight:
133 * 919 * 7 = 6433 */
134#define DRBD_AL_EXTENTS_MIN 7 130#define DRBD_AL_EXTENTS_MIN 7
135#define DRBD_AL_EXTENTS_MAX 6433 131 /* we use u16 as "slot number", (u16)~0 is "FREE".
132 * If you use >= 292 kB on-disk ring buffer,
133 * this is the maximum you can use: */
134#define DRBD_AL_EXTENTS_MAX 0xfffe
136#define DRBD_AL_EXTENTS_DEF 1237 135#define DRBD_AL_EXTENTS_DEF 1237
137#define DRBD_AL_EXTENTS_SCALE '1' 136#define DRBD_AL_EXTENTS_SCALE '1'
138 137
diff --git a/include/linux/idr.h b/include/linux/idr.h
index a470ac3ef49d..871a213a8477 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -124,11 +124,13 @@ static inline void *idr_find(struct idr *idr, int id)
124 * @idp: idr handle 124 * @idp: idr handle
125 * @entry: the type * to use as cursor 125 * @entry: the type * to use as cursor
126 * @id: id entry's key 126 * @id: id entry's key
127 *
128 * @entry and @id do not need to be initialized before the loop, and
129 * after normal termination @entry is left with the value NULL. This
130 * is convenient for a "not found" value.
127 */ 131 */
128#define idr_for_each_entry(idp, entry, id) \ 132#define idr_for_each_entry(idp, entry, id) \
129 for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ 133 for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
130 entry != NULL; \
131 ++id, entry = (typeof(entry))idr_get_next((idp), &(id)))
132 134
133/* 135/*
134 * Don't use the following functions. These exist only to suppress 136 * Don't use the following functions. These exist only to suppress
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
index 4019013c6593..46262284de47 100644
--- a/include/linux/lru_cache.h
+++ b/include/linux/lru_cache.h
@@ -256,6 +256,7 @@ extern void lc_destroy(struct lru_cache *lc);
256extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); 256extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
257extern void lc_del(struct lru_cache *lc, struct lc_element *element); 257extern void lc_del(struct lru_cache *lc, struct lc_element *element);
258 258
259extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr);
259extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); 260extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
260extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); 261extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
261extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); 262extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8da67d625e13..0616ffe45702 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -133,10 +133,20 @@ do { \
133 _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ 133 _down_write_nest_lock(sem, &(nest_lock)->dep_map); \
134} while (0); 134} while (0);
135 135
136/*
137 * Take/release a lock when not the owner will release it.
138 *
139 * [ This API should be avoided as much as possible - the
140 * proper abstraction for this case is completions. ]
141 */
142extern void down_read_non_owner(struct rw_semaphore *sem);
143extern void up_read_non_owner(struct rw_semaphore *sem);
136#else 144#else
137# define down_read_nested(sem, subclass) down_read(sem) 145# define down_read_nested(sem, subclass) down_read(sem)
138# define down_write_nest_lock(sem, nest_lock) down_write(sem) 146# define down_write_nest_lock(sem, nest_lock) down_write(sem)
139# define down_write_nested(sem, subclass) down_write(sem) 147# define down_write_nested(sem, subclass) down_write(sem)
148# define down_read_non_owner(sem) down_read(sem)
149# define up_read_non_owner(sem) up_read(sem)
140#endif 150#endif
141 151
142#endif /* _LINUX_RWSEM_H */ 152#endif /* _LINUX_RWSEM_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 022c085ac3c5..caa8f4d0186b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1411,6 +1411,10 @@ struct task_struct {
1411#ifdef CONFIG_UPROBES 1411#ifdef CONFIG_UPROBES
1412 struct uprobe_task *utask; 1412 struct uprobe_task *utask;
1413#endif 1413#endif
1414#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
1415 unsigned int sequential_io;
1416 unsigned int sequential_io_avg;
1417#endif
1414}; 1418};
1415 1419
1416/* Future-safe accessor for struct task_struct's cpus_allowed. */ 1420/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
new file mode 100644
index 000000000000..3cc5a0b278c3
--- /dev/null
+++ b/include/trace/events/bcache.h
@@ -0,0 +1,271 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM bcache
3
4#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_BCACHE_H
6
7#include <linux/tracepoint.h>
8
9struct search;
10
11DECLARE_EVENT_CLASS(bcache_request,
12
13 TP_PROTO(struct search *s, struct bio *bio),
14
15 TP_ARGS(s, bio),
16
17 TP_STRUCT__entry(
18 __field(dev_t, dev )
19 __field(unsigned int, orig_major )
20 __field(unsigned int, orig_minor )
21 __field(sector_t, sector )
22 __field(dev_t, orig_sector )
23 __field(unsigned int, nr_sector )
24 __array(char, rwbs, 6 )
25 __array(char, comm, TASK_COMM_LEN )
26 ),
27
28 TP_fast_assign(
29 __entry->dev = bio->bi_bdev->bd_dev;
30 __entry->orig_major = s->d->disk->major;
31 __entry->orig_minor = s->d->disk->first_minor;
32 __entry->sector = bio->bi_sector;
33 __entry->orig_sector = bio->bi_sector - 16;
34 __entry->nr_sector = bio->bi_size >> 9;
35 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
36 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
37 ),
38
39 TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)",
40 MAJOR(__entry->dev), MINOR(__entry->dev),
41 __entry->rwbs,
42 (unsigned long long)__entry->sector,
43 __entry->nr_sector, __entry->comm,
44 __entry->orig_major, __entry->orig_minor,
45 (unsigned long long)__entry->orig_sector)
46);
47
48DEFINE_EVENT(bcache_request, bcache_request_start,
49
50 TP_PROTO(struct search *s, struct bio *bio),
51
52 TP_ARGS(s, bio)
53);
54
55DEFINE_EVENT(bcache_request, bcache_request_end,
56
57 TP_PROTO(struct search *s, struct bio *bio),
58
59 TP_ARGS(s, bio)
60);
61
62DECLARE_EVENT_CLASS(bcache_bio,
63
64 TP_PROTO(struct bio *bio),
65
66 TP_ARGS(bio),
67
68 TP_STRUCT__entry(
69 __field(dev_t, dev )
70 __field(sector_t, sector )
71 __field(unsigned int, nr_sector )
72 __array(char, rwbs, 6 )
73 __array(char, comm, TASK_COMM_LEN )
74 ),
75
76 TP_fast_assign(
77 __entry->dev = bio->bi_bdev->bd_dev;
78 __entry->sector = bio->bi_sector;
79 __entry->nr_sector = bio->bi_size >> 9;
80 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
81 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
82 ),
83
84 TP_printk("%d,%d %s %llu + %u [%s]",
85 MAJOR(__entry->dev), MINOR(__entry->dev),
86 __entry->rwbs,
87 (unsigned long long)__entry->sector,
88 __entry->nr_sector, __entry->comm)
89);
90
91
92DEFINE_EVENT(bcache_bio, bcache_passthrough,
93
94 TP_PROTO(struct bio *bio),
95
96 TP_ARGS(bio)
97);
98
99DEFINE_EVENT(bcache_bio, bcache_cache_hit,
100
101 TP_PROTO(struct bio *bio),
102
103 TP_ARGS(bio)
104);
105
106DEFINE_EVENT(bcache_bio, bcache_cache_miss,
107
108 TP_PROTO(struct bio *bio),
109
110 TP_ARGS(bio)
111);
112
113DEFINE_EVENT(bcache_bio, bcache_read_retry,
114
115 TP_PROTO(struct bio *bio),
116
117 TP_ARGS(bio)
118);
119
120DEFINE_EVENT(bcache_bio, bcache_writethrough,
121
122 TP_PROTO(struct bio *bio),
123
124 TP_ARGS(bio)
125);
126
127DEFINE_EVENT(bcache_bio, bcache_writeback,
128
129 TP_PROTO(struct bio *bio),
130
131 TP_ARGS(bio)
132);
133
134DEFINE_EVENT(bcache_bio, bcache_write_skip,
135
136 TP_PROTO(struct bio *bio),
137
138 TP_ARGS(bio)
139);
140
141DEFINE_EVENT(bcache_bio, bcache_btree_read,
142
143 TP_PROTO(struct bio *bio),
144
145 TP_ARGS(bio)
146);
147
148DEFINE_EVENT(bcache_bio, bcache_btree_write,
149
150 TP_PROTO(struct bio *bio),
151
152 TP_ARGS(bio)
153);
154
155DEFINE_EVENT(bcache_bio, bcache_write_dirty,
156
157 TP_PROTO(struct bio *bio),
158
159 TP_ARGS(bio)
160);
161
162DEFINE_EVENT(bcache_bio, bcache_read_dirty,
163
164 TP_PROTO(struct bio *bio),
165
166 TP_ARGS(bio)
167);
168
169DEFINE_EVENT(bcache_bio, bcache_write_moving,
170
171 TP_PROTO(struct bio *bio),
172
173 TP_ARGS(bio)
174);
175
176DEFINE_EVENT(bcache_bio, bcache_read_moving,
177
178 TP_PROTO(struct bio *bio),
179
180 TP_ARGS(bio)
181);
182
183DEFINE_EVENT(bcache_bio, bcache_journal_write,
184
185 TP_PROTO(struct bio *bio),
186
187 TP_ARGS(bio)
188);
189
190DECLARE_EVENT_CLASS(bcache_cache_bio,
191
192 TP_PROTO(struct bio *bio,
193 sector_t orig_sector,
194 struct block_device* orig_bdev),
195
196 TP_ARGS(bio, orig_sector, orig_bdev),
197
198 TP_STRUCT__entry(
199 __field(dev_t, dev )
200 __field(dev_t, orig_dev )
201 __field(sector_t, sector )
202 __field(sector_t, orig_sector )
203 __field(unsigned int, nr_sector )
204 __array(char, rwbs, 6 )
205 __array(char, comm, TASK_COMM_LEN )
206 ),
207
208 TP_fast_assign(
209 __entry->dev = bio->bi_bdev->bd_dev;
210 __entry->orig_dev = orig_bdev->bd_dev;
211 __entry->sector = bio->bi_sector;
212 __entry->orig_sector = orig_sector;
213 __entry->nr_sector = bio->bi_size >> 9;
214 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
215 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
216 ),
217
218 TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)",
219 MAJOR(__entry->dev), MINOR(__entry->dev),
220 __entry->rwbs,
221 (unsigned long long)__entry->sector,
222 __entry->nr_sector, __entry->comm,
223 MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev),
224 (unsigned long long)__entry->orig_sector)
225);
226
227DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert,
228
229 TP_PROTO(struct bio *bio,
230 sector_t orig_sector,
231 struct block_device *orig_bdev),
232
233 TP_ARGS(bio, orig_sector, orig_bdev)
234);
235
236DECLARE_EVENT_CLASS(bcache_gc,
237
238 TP_PROTO(uint8_t *uuid),
239
240 TP_ARGS(uuid),
241
242 TP_STRUCT__entry(
243 __field(uint8_t *, uuid)
244 ),
245
246 TP_fast_assign(
247 __entry->uuid = uuid;
248 ),
249
250 TP_printk("%pU", __entry->uuid)
251);
252
253
254DEFINE_EVENT(bcache_gc, bcache_gc_start,
255
256 TP_PROTO(uint8_t *uuid),
257
258 TP_ARGS(uuid)
259);
260
261DEFINE_EVENT(bcache_gc, bcache_gc_end,
262
263 TP_PROTO(uint8_t *uuid),
264
265 TP_ARGS(uuid)
266);
267
268#endif /* _TRACE_BCACHE_H */
269
270/* This part must be outside protection */
271#include <trace/define_trace.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index c509cc4a0d53..987b28a1f01b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1304,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1304 p->memcg_batch.do_batch = 0; 1304 p->memcg_batch.do_batch = 0;
1305 p->memcg_batch.memcg = NULL; 1305 p->memcg_batch.memcg = NULL;
1306#endif 1306#endif
/*
 * Reset bcache's per-task sequential IO tracking for the new task.  The guard
 * must match the one around the fields in struct task_struct, which also
 * exist when bcache is built as a module; otherwise a child would inherit
 * its parent's counters in modular builds.
 */
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
	p->sequential_io	= 0;
	p->sequential_io_avg	= 0;
#endif
1307 1311
1308 /* Perform scheduler related setup. Assign this task to a CPU. */ 1312 /* Perform scheduler related setup. Assign this task to a CPU. */
1309 sched_fork(p); 1313 sched_fork(p);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 6a3bccba7e7d..1f3186b37fd5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2998,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2998EXPORT_SYMBOL_GPL(lockdep_init_map); 2998EXPORT_SYMBOL_GPL(lockdep_init_map);
2999 2999
3000struct lock_class_key __lockdep_no_validate__; 3000struct lock_class_key __lockdep_no_validate__;
3001EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
3001 3002
3002static int 3003static int
3003print_lock_nested_lock_not_held(struct task_struct *curr, 3004print_lock_nested_lock_not_held(struct task_struct *curr,
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b3c6c3fcd847..cfff1435bdfb 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
126 126
127EXPORT_SYMBOL(_down_write_nest_lock); 127EXPORT_SYMBOL(_down_write_nest_lock);
128 128
/*
 * down_read_non_owner - take @sem for reading when a different context will
 * release it.  May sleep.  Pair with up_read_non_owner().
 */
void down_read_non_owner(struct rw_semaphore *sem)
{
	might_sleep();
	__down_read(sem);
}
EXPORT_SYMBOL(down_read_non_owner);
137
129void down_write_nested(struct rw_semaphore *sem, int subclass) 138void down_write_nested(struct rw_semaphore *sem, int subclass)
130{ 139{
131 might_sleep(); 140 might_sleep();
@@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
136 145
137EXPORT_SYMBOL(down_write_nested); 146EXPORT_SYMBOL(down_write_nested);
138 147
/*
 * up_read_non_owner - release a read lock on @sem that was taken with
 * down_read_non_owner(), possibly by another context.
 */
void up_read_non_owner(struct rw_semaphore *sem)
{
	__up_read(sem);
}
EXPORT_SYMBOL(up_read_non_owner);
154
139#endif 155#endif
140 156
141 157
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ed58a3216a6d..b8b8560bfb95 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1808,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1808 1808
1809 rwbs[i] = '\0'; 1809 rwbs[i] = '\0';
1810} 1810}
1811EXPORT_SYMBOL_GPL(blk_fill_rwbs);
1811 1812
1812#endif /* CONFIG_EVENT_TRACING */ 1813#endif /* CONFIG_EVENT_TRACING */
1813 1814
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index 8335d39d2ccd..4a83ecd03650 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -365,7 +365,13 @@ static int lc_unused_element_available(struct lru_cache *lc)
365 return 0; 365 return 0;
366} 366}
367 367
368static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) 368/* used as internal flags to __lc_get */
369enum {
370 LC_GET_MAY_CHANGE = 1,
371 LC_GET_MAY_USE_UNCOMMITTED = 2,
372};
373
374static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsigned int flags)
369{ 375{
370 struct lc_element *e; 376 struct lc_element *e;
371 377
@@ -380,22 +386,31 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool
380 * this enr is currently being pulled in already, 386 * this enr is currently being pulled in already,
381 * and will be available once the pending transaction 387 * and will be available once the pending transaction
382 * has been committed. */ 388 * has been committed. */
383 if (e && e->lc_new_number == e->lc_number) { 389 if (e) {
390 if (e->lc_new_number != e->lc_number) {
391 /* It has been found above, but on the "to_be_changed"
392 * list, not yet committed. Don't pull it in twice,
393 * wait for the transaction, then try again...
394 */
395 if (!(flags & LC_GET_MAY_USE_UNCOMMITTED))
396 RETURN(NULL);
397 /* ... unless the caller is aware of the implications,
398 * probably preparing a cumulative transaction. */
399 ++e->refcnt;
400 ++lc->hits;
401 RETURN(e);
402 }
403 /* else: lc_new_number == lc_number; a real hit. */
384 ++lc->hits; 404 ++lc->hits;
385 if (e->refcnt++ == 0) 405 if (e->refcnt++ == 0)
386 lc->used++; 406 lc->used++;
387 list_move(&e->list, &lc->in_use); /* Not evictable... */ 407 list_move(&e->list, &lc->in_use); /* Not evictable... */
388 RETURN(e); 408 RETURN(e);
389 } 409 }
410 /* e == NULL */
390 411
391 ++lc->misses; 412 ++lc->misses;
392 if (!may_change) 413 if (!(flags & LC_GET_MAY_CHANGE))
393 RETURN(NULL);
394
395 /* It has been found above, but on the "to_be_changed" list, not yet
396 * committed. Don't pull it in twice, wait for the transaction, then
397 * try again */
398 if (e)
399 RETURN(NULL); 414 RETURN(NULL);
400 415
401 /* To avoid races with lc_try_lock(), first, mark us dirty 416 /* To avoid races with lc_try_lock(), first, mark us dirty
@@ -477,7 +492,27 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool
477 */ 492 */
478struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) 493struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
479{ 494{
480 return __lc_get(lc, enr, 1); 495 return __lc_get(lc, enr, LC_GET_MAY_CHANGE);
496}
497
498/**
499 * lc_get_cumulative - like lc_get; also finds to-be-changed elements
500 * @lc: the lru cache to operate on
501 * @enr: the label to look up
502 *
503 * Unlike lc_get() this also returns the element for @enr if it belongs to
504 * a pending transaction, so the return values are the same as for lc_get(),
505 * plus:
506 *
507 * pointer to an element already on the "to_be_changed" list.
508 * In this case, the cache was already marked %LC_DIRTY.
509 *
510 * Caller needs to make sure that the pending transaction is completed,
511 * before proceeding to actually use this element.
512 */
513struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr)
514{
515 return __lc_get(lc, enr, LC_GET_MAY_CHANGE|LC_GET_MAY_USE_UNCOMMITTED);
481} 516}
482 517
483/** 518/**
@@ -648,3 +683,4 @@ EXPORT_SYMBOL(lc_seq_printf_stats);
648EXPORT_SYMBOL(lc_seq_dump_details); 683EXPORT_SYMBOL(lc_seq_dump_details);
649EXPORT_SYMBOL(lc_try_lock); 684EXPORT_SYMBOL(lc_try_lock);
650EXPORT_SYMBOL(lc_is_used); 685EXPORT_SYMBOL(lc_is_used);
686EXPORT_SYMBOL(lc_get_cumulative);