diff options
62 files changed, 17681 insertions, 372 deletions
diff --git a/Documentation/ABI/testing/sysfs-block-bcache b/Documentation/ABI/testing/sysfs-block-bcache new file mode 100644 index 000000000000..9e4bbc5d51fd --- /dev/null +++ b/Documentation/ABI/testing/sysfs-block-bcache | |||
@@ -0,0 +1,156 @@ | |||
1 | What: /sys/block/<disk>/bcache/unregister | ||
2 | Date: November 2010 | ||
3 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
4 | Description: | ||
5 | A write to this file causes the backing device or cache to be | ||
6 | unregistered. If a backing device had dirty data in the cache, | ||
7 | writeback mode is automatically disabled and all dirty data is | ||
8 | flushed before the device is unregistered. Caches unregister | ||
9 | all associated backing devices before unregistering themselves. | ||
10 | |||
11 | What: /sys/block/<disk>/bcache/clear_stats | ||
12 | Date: November 2010 | ||
13 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
14 | Description: | ||
15 | Writing to this file resets all the statistics for the device. | ||
16 | |||
17 | What: /sys/block/<disk>/bcache/cache | ||
18 | Date: November 2010 | ||
19 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
20 | Description: | ||
21 | For a backing device that has cache, a symlink to | ||
22 | the bcache/ dir of that cache. | ||
23 | |||
24 | What: /sys/block/<disk>/bcache/cache_hits | ||
25 | Date: November 2010 | ||
26 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
27 | Description: | ||
28 | For backing devices: integer number of full cache hits, | ||
29 | counted per bio. A partial cache hit counts as a miss. | ||
30 | |||
31 | What: /sys/block/<disk>/bcache/cache_misses | ||
32 | Date: November 2010 | ||
33 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
34 | Description: | ||
35 | For backing devices: integer number of cache misses. | ||
36 | |||
37 | What: /sys/block/<disk>/bcache/cache_hit_ratio | ||
38 | Date: November 2010 | ||
39 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
40 | Description: | ||
41 | For backing devices: cache hits as a percentage. | ||
42 | |||
43 | What: /sys/block/<disk>/bcache/sequential_cutoff | ||
44 | Date: November 2010 | ||
45 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
46 | Description: | ||
47 | For backing devices: Threshold past which sequential IO will | ||
48 | skip the cache. Read and written as bytes in human readable | ||
49 | units (i.e. echo 10M > sequential_cutoff). | ||
50 | |||
51 | What: /sys/block/<disk>/bcache/bypassed | ||
52 | Date: November 2010 | ||
53 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
54 | Description: | ||
55 | Sum of all reads and writes that have bypassed the cache (due | ||
56 | to the sequential cutoff). Expressed as bytes in human | ||
57 | readable units. | ||
58 | |||
59 | What: /sys/block/<disk>/bcache/writeback | ||
60 | Date: November 2010 | ||
61 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
62 | Description: | ||
63 | For backing devices: When on, writeback caching is enabled and | ||
64 | writes will be buffered in the cache. When off, caching is in | ||
65 | writethrough mode; reads and writes will be added to the | ||
66 | cache but no write buffering will take place. | ||
67 | |||
68 | What: /sys/block/<disk>/bcache/writeback_running | ||
69 | Date: November 2010 | ||
70 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
71 | Description: | ||
72 | For backing devices: when off, dirty data will not be written | ||
73 | from the cache to the backing device. The cache will still be | ||
74 | used to buffer writes until it is mostly full, at which point | ||
75 | writes transparently revert to writethrough mode. Intended only | ||
76 | for benchmarking/testing. | ||
77 | |||
78 | What: /sys/block/<disk>/bcache/writeback_delay | ||
79 | Date: November 2010 | ||
80 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
81 | Description: | ||
82 | For backing devices: In writeback mode, when dirty data is | ||
83 | written to the cache and the cache held no dirty data for that | ||
84 | backing device, writeback from cache to backing device starts | ||
85 | after this delay, expressed as an integer number of seconds. | ||
86 | |||
87 | What: /sys/block/<disk>/bcache/writeback_percent | ||
88 | Date: November 2010 | ||
89 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
90 | Description: | ||
91 | For backing devices: If nonzero, writeback from cache to | ||
92 | backing device only takes place when more than this percentage | ||
93 | of the cache is used, allowing more write coalescing to take | ||
94 | place and reducing total number of writes sent to the backing | ||
95 | device. Integer between 0 and 40. | ||
96 | |||
97 | What: /sys/block/<disk>/bcache/synchronous | ||
98 | Date: November 2010 | ||
99 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
100 | Description: | ||
101 | For a cache, a boolean that allows synchronous mode to be | ||
102 | switched on and off. In synchronous mode all writes are ordered | ||
103 | such that the cache can reliably recover from unclean shutdown; | ||
104 | if disabled bcache will not generally wait for writes to | ||
105 | complete but if the cache is not shut down cleanly all data | ||
106 | will be discarded from the cache. Should not be turned off with | ||
107 | writeback caching enabled. | ||
108 | |||
109 | What: /sys/block/<disk>/bcache/discard | ||
110 | Date: November 2010 | ||
111 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
112 | Description: | ||
113 | For a cache, a boolean allowing discard/TRIM to be turned off | ||
114 | or back on if the device supports it. | ||
115 | |||
116 | What: /sys/block/<disk>/bcache/bucket_size | ||
117 | Date: November 2010 | ||
118 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
119 | Description: | ||
120 | For a cache, bucket size in human readable units, as set at | ||
121 | cache creation time; should match the erase block size of the | ||
122 | SSD for optimal performance. | ||
123 | |||
124 | What: /sys/block/<disk>/bcache/nbuckets | ||
125 | Date: November 2010 | ||
126 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
127 | Description: | ||
128 | For a cache, the number of usable buckets. | ||
129 | |||
130 | What: /sys/block/<disk>/bcache/tree_depth | ||
131 | Date: November 2010 | ||
132 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
133 | Description: | ||
134 | For a cache, height of the btree excluding leaf nodes (i.e. a | ||
135 | one node tree will have a depth of 0). | ||
136 | |||
137 | What: /sys/block/<disk>/bcache/btree_cache_size | ||
138 | Date: November 2010 | ||
139 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
140 | Description: | ||
141 | Number of btree buckets/nodes that are currently cached in | ||
142 | memory; cache dynamically grows and shrinks in response to | ||
143 | memory pressure from the rest of the system. | ||
144 | |||
145 | What: /sys/block/<disk>/bcache/written | ||
146 | Date: November 2010 | ||
147 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
148 | Description: | ||
149 | For a cache, total amount of data in human readable units | ||
150 | written to the cache, excluding all metadata. | ||
151 | |||
152 | What: /sys/block/<disk>/bcache/btree_written | ||
153 | Date: November 2010 | ||
154 | Contact: Kent Overstreet <kent.overstreet@gmail.com> | ||
155 | Description: | ||
156 | For a cache, sum of all btree writes in human readable units. | ||
diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt new file mode 100644 index 000000000000..77db8809bd96 --- /dev/null +++ b/Documentation/bcache.txt | |||
@@ -0,0 +1,431 @@ | |||
1 | Say you've got a big slow raid 6, and an X-25E or three. Wouldn't it be | ||
2 | nice if you could use them as cache... Hence bcache. | ||
3 | |||
4 | Wiki and git repositories are at: | ||
5 | http://bcache.evilpiepirate.org | ||
6 | http://evilpiepirate.org/git/linux-bcache.git | ||
7 | http://evilpiepirate.org/git/bcache-tools.git | ||
8 | |||
9 | It's designed around the performance characteristics of SSDs - it only allocates | ||
10 | in erase block sized buckets, and it uses a hybrid btree/log to track cached | ||
11 | extents (which can be anywhere from a single sector to the bucket size). It's | ||
12 | designed to avoid random writes at all costs; it fills up an erase block | ||
13 | sequentially, then issues a discard before reusing it. | ||
14 | |||
15 | Both writethrough and writeback caching are supported. Writeback defaults to | ||
16 | off, but can be switched on and off arbitrarily at runtime. Bcache goes to | ||
17 | great lengths to protect your data - it reliably handles unclean shutdown. (It | ||
18 | doesn't even have a notion of a clean shutdown; bcache simply doesn't return | ||
19 | writes as completed until they're on stable storage). | ||
20 | |||
21 | Writeback caching can use most of the cache for buffering writes - writing | ||
22 | dirty data to the backing device is always done sequentially, scanning from the | ||
23 | start to the end of the index. | ||
24 | |||
25 | Since random IO is what SSDs excel at, there generally won't be much benefit | ||
26 | to caching large sequential IO. Bcache detects sequential IO and skips it; | ||
27 | it also keeps a rolling average of the IO sizes per task, and as long as the | ||
28 | average is above the cutoff it will skip all IO from that task - instead of | ||
29 | caching the first 512k after every seek. Backups and large file copies should | ||
30 | thus entirely bypass the cache. | ||
31 | |||
32 | In the event of a data IO error on the flash it will try to recover by reading | ||
33 | from disk or invalidating cache entries. For unrecoverable errors (metadata | ||
34 | or dirty data), caching is automatically disabled; if dirty data was present | ||
35 | in the cache it first disables writeback caching and waits for all dirty data | ||
36 | to be flushed. | ||
37 | |||
38 | Getting started: | ||
39 | You'll need make-bcache from the bcache-tools repository. Both the cache device | ||
40 | and backing device must be formatted before use. | ||
41 | make-bcache -B /dev/sdb | ||
42 | make-bcache -C /dev/sdc | ||
43 | |||
44 | make-bcache has the ability to format multiple devices at the same time - if | ||
45 | you format your backing devices and cache device at the same time, you won't | ||
46 | have to manually attach: | ||
47 | make-bcache -B /dev/sda /dev/sdb -C /dev/sdc | ||
48 | |||
49 | To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register: | ||
50 | |||
51 | echo /dev/sdb > /sys/fs/bcache/register | ||
52 | echo /dev/sdc > /sys/fs/bcache/register | ||
53 | |||
54 | To register your bcache devices automatically, you could add something like | ||
55 | this to an init script: | ||
56 | |||
57 | echo /dev/sd* > /sys/fs/bcache/register_quiet | ||
58 | |||
59 | It'll look for bcache superblocks and ignore everything that doesn't have one. | ||
60 | |||
61 | Registering the backing device makes the bcache show up in /dev; you can now | ||
62 | format it and use it as normal. But the first time using a new bcache device, | ||
63 | it'll be running in passthrough mode until you attach it to a cache. See the | ||
64 | section on attaching. | ||
65 | |||
66 | The devices show up at /dev/bcacheN, and can be controlled via sysfs from | ||
67 | /sys/block/bcacheN/bcache: | ||
68 | |||
69 | mkfs.ext4 /dev/bcache0 | ||
70 | mount /dev/bcache0 /mnt | ||
71 | |||
72 | Cache devices are managed as sets; multiple caches per set isn't supported yet | ||
73 | but will allow for mirroring of metadata and dirty data in the future. Your new | ||
74 | cache set shows up as /sys/fs/bcache/<UUID> | ||
75 | |||
76 | ATTACHING: | ||
77 | |||
78 | After your cache device and backing device are registered, the backing device | ||
79 | must be attached to your cache set to enable caching. Attaching a backing | ||
80 | device to a cache set is done thusly, with the UUID of the cache set in | ||
81 | /sys/fs/bcache: | ||
82 | |||
83 | echo <UUID> > /sys/block/bcache0/bcache/attach | ||
84 | |||
85 | This only has to be done once. The next time you reboot, just reregister all | ||
86 | your bcache devices. If a backing device has data in a cache somewhere, the | ||
87 | /dev/bcache# device won't be created until the cache shows up - particularly | ||
88 | important if you have writeback caching turned on. | ||
89 | |||
90 | If you're booting up and your cache device is gone and never coming back, you | ||
91 | can force run the backing device: | ||
92 | |||
93 | echo 1 > /sys/block/sdb/bcache/running | ||
94 | |||
95 | (You need to use /sys/block/sdb (or whatever your backing device is called), not | ||
96 | /sys/block/bcache0, because bcache0 doesn't exist yet. If you're using a | ||
97 | partition, the bcache directory would be at /sys/block/sdb/sdb2/bcache) | ||
98 | |||
99 | The backing device will still use that cache set if it shows up in the future, | ||
100 | but all the cached data will be invalidated. If there was dirty data in the | ||
101 | cache, don't expect the filesystem to be recoverable - you will have massive | ||
102 | filesystem corruption, though ext4's fsck does work miracles. | ||
103 | |||
104 | ERROR HANDLING: | ||
105 | |||
106 | Bcache tries to transparently handle IO errors to/from the cache device without | ||
107 | affecting normal operation; if it sees too many errors (the threshold is | ||
108 | configurable, and defaults to 0) it shuts down the cache device and switches all | ||
109 | the backing devices to passthrough mode. | ||
110 | |||
111 | - For reads from the cache, if they error we just retry the read from the | ||
112 | backing device. | ||
113 | |||
114 | - For writethrough writes, if the write to the cache errors we just switch to | ||
115 | invalidating the data at that lba in the cache (i.e. the same thing we do for | ||
116 | a write that bypasses the cache) | ||
117 | |||
118 | - For writeback writes, we currently pass that error back up to the | ||
119 | filesystem/userspace. This could be improved - we could retry it as a write | ||
120 | that skips the cache so we don't have to error the write. | ||
121 | |||
122 | - When we detach, we first try to flush any dirty data (if we were running in | ||
123 | writeback mode). It currently doesn't do anything intelligent if it fails to | ||
124 | read some of the dirty data, though. | ||
125 | |||
126 | TROUBLESHOOTING PERFORMANCE: | ||
127 | |||
128 | Bcache has a bunch of config options and tunables. The defaults are intended to | ||
129 | be reasonable for typical desktop and server workloads, but they're not what you | ||
130 | want for getting the best possible numbers when benchmarking. | ||
131 | |||
132 | - Bad write performance | ||
133 | |||
134 | If write performance is not what you expected, you probably wanted to be | ||
135 | running in writeback mode, which isn't the default (not due to a lack of | ||
136 | maturity, but simply because in writeback mode you'll lose data if something | ||
137 | happens to your SSD) | ||
138 | |||
139 | # echo writeback > /sys/block/bcache0/cache_mode | ||
140 | |||
141 | - Bad performance, or traffic not going to the SSD that you'd expect | ||
142 | |||
143 | By default, bcache doesn't cache everything. It tries to skip sequential IO - | ||
144 | because you really want to be caching the random IO, and if you copy a 10 | ||
145 | gigabyte file you probably don't want that pushing 10 gigabytes of randomly | ||
146 | accessed data out of your cache. | ||
147 | |||
148 | But if you want to benchmark reads from cache, and you start out with fio | ||
149 | writing an 8 gigabyte test file - so you want to disable that. | ||
150 | |||
151 | # echo 0 > /sys/block/bcache0/bcache/sequential_cutoff | ||
152 | |||
153 | To set it back to the default (4 mb), do | ||
154 | |||
155 | # echo 4M > /sys/block/bcache0/bcache/sequential_cutoff | ||
156 | |||
157 | - Traffic's still going to the spindle/still getting cache misses | ||
158 | |||
159 | In the real world, SSDs don't always keep up with disks - particularly with | ||
160 | slower SSDs, many disks being cached by one SSD, or mostly sequential IO. So | ||
161 | you want to avoid being bottlenecked by the SSD and having it slow everything | ||
162 | down. | ||
163 | |||
164 | To avoid that bcache tracks latency to the cache device, and gradually | ||
165 | throttles traffic if the latency exceeds a threshold (it does this by | ||
166 | cranking down the sequential bypass). | ||
167 | |||
168 | You can disable this if you need to by setting the thresholds to 0: | ||
169 | |||
170 | # echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us | ||
171 | # echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us | ||
172 | |||
173 | The default is 2000 us (2 milliseconds) for reads, and 20000 for writes. | ||
174 | |||
175 | - Still getting cache misses, of the same data | ||
176 | |||
177 | One last issue that sometimes trips people up is actually an old bug, due to | ||
178 | the way cache coherency is handled for cache misses. If a btree node is full, | ||
179 | a cache miss won't be able to insert a key for the new data and the data | ||
180 | won't be written to the cache. | ||
181 | |||
182 | In practice this isn't an issue because as soon as a write comes along it'll | ||
183 | cause the btree node to be split, and you need almost no write traffic for | ||
184 | this to not show up enough to be noticeable (especially since bcache's btree | ||
185 | nodes are huge and index large regions of the device). But when you're | ||
186 | benchmarking, if you're trying to warm the cache by reading a bunch of data | ||
187 | and there's no other traffic - that can be a problem. | ||
188 | |||
189 | Solution: warm the cache by doing writes, or use the testing branch (there's | ||
190 | a fix for the issue there). | ||
191 | |||
192 | SYSFS - BACKING DEVICE: | ||
193 | |||
194 | attach | ||
195 | Echo the UUID of a cache set to this file to enable caching. | ||
196 | |||
197 | cache_mode | ||
198 | Can be one of either writethrough, writeback, writearound or none. | ||
199 | |||
200 | clear_stats | ||
201 | Writing to this file resets the running total stats (not the day/hour/5 minute | ||
202 | decaying versions). | ||
203 | |||
204 | detach | ||
205 | Write to this file to detach from a cache set. If there is dirty data in the | ||
206 | cache, it will be flushed first. | ||
207 | |||
208 | dirty_data | ||
209 | Amount of dirty data for this backing device in the cache. Continuously | ||
210 | updated unlike the cache set's version, but may be slightly off. | ||
211 | |||
212 | label | ||
213 | Name of underlying device. | ||
214 | |||
215 | readahead | ||
216 | Size of readahead that should be performed. Defaults to 0. If set to e.g. | ||
217 | 1M, it will round cache miss reads up to that size, but without overlapping | ||
218 | existing cache entries. | ||
219 | |||
220 | running | ||
221 | 1 if bcache is running (i.e. whether the /dev/bcache device exists, whether | ||
222 | it's in passthrough mode or caching). | ||
223 | |||
224 | sequential_cutoff | ||
225 | A sequential IO will bypass the cache once it passes this threshold; the | ||
226 | most recent 128 IOs are tracked so sequential IO can be detected even when | ||
227 | it isn't all done at once. | ||
228 | |||
229 | sequential_merge | ||
230 | If non zero, bcache keeps a list of the last 128 requests submitted to compare | ||
231 | against all new requests to determine which new requests are sequential | ||
232 | continuations of previous requests for the purpose of determining sequential | ||
233 | cutoff. This is necessary if the sequential cutoff value is greater than the | ||
234 | maximum acceptable sequential size for any single request. | ||
235 | |||
236 | state | ||
237 | The backing device can be in one of four different states: | ||
238 | |||
239 | no cache: Has never been attached to a cache set. | ||
240 | |||
241 | clean: Part of a cache set, and there is no cached dirty data. | ||
242 | |||
243 | dirty: Part of a cache set, and there is cached dirty data. | ||
244 | |||
245 | inconsistent: The backing device was forcibly run by the user when there was | ||
246 | dirty data cached but the cache set was unavailable; whatever data was on the | ||
247 | backing device has likely been corrupted. | ||
248 | |||
249 | stop | ||
250 | Write to this file to shut down the bcache device and close the backing | ||
251 | device. | ||
252 | |||
253 | writeback_delay | ||
254 | When dirty data is written to the cache and it previously did not contain | ||
255 | any, waits some number of seconds before initiating writeback. Defaults to | ||
256 | 30. | ||
257 | |||
258 | writeback_percent | ||
259 | If nonzero, bcache tries to keep around this percentage of the cache dirty by | ||
260 | throttling background writeback and using a PD controller to smoothly adjust | ||
261 | the rate. | ||
262 | |||
263 | writeback_rate | ||
264 | Rate in sectors per second - if writeback_percent is nonzero, background | ||
265 | writeback is throttled to this rate. Continuously adjusted by bcache but may | ||
266 | also be set by the user. | ||
267 | |||
268 | writeback_running | ||
269 | If off, writeback of dirty data will not take place at all. Dirty data will | ||
270 | still be added to the cache until it is mostly full; only meant for | ||
271 | benchmarking. Defaults to on. | ||
272 | |||
273 | SYSFS - BACKING DEVICE STATS: | ||
274 | |||
275 | There are directories with these numbers for a running total, as well as | ||
276 | versions that decay over the past day, hour and 5 minutes; they're also | ||
277 | aggregated in the cache set directory as well. | ||
278 | |||
279 | bypassed | ||
280 | Amount of IO (both reads and writes) that has bypassed the cache | ||
281 | |||
282 | cache_hits | ||
283 | cache_misses | ||
284 | cache_hit_ratio | ||
285 | Hits and misses are counted per individual IO as bcache sees them; a | ||
286 | partial hit is counted as a miss. | ||
287 | |||
288 | cache_bypass_hits | ||
289 | cache_bypass_misses | ||
290 | Hits and misses for IO that is intended to skip the cache are still counted, | ||
291 | but broken out here. | ||
292 | |||
293 | cache_miss_collisions | ||
294 | Counts instances where data was going to be inserted into the cache from a | ||
295 | cache miss, but raced with a write and data was already present (usually 0 | ||
296 | since the synchronization for cache misses was rewritten) | ||
297 | |||
298 | cache_readaheads | ||
299 | Count of times readahead occurred. | ||
300 | |||
301 | SYSFS - CACHE SET: | ||
302 | |||
303 | average_key_size | ||
304 | Average data per key in the btree. | ||
305 | |||
306 | bdev<0..n> | ||
307 | Symlink to each of the attached backing devices. | ||
308 | |||
309 | block_size | ||
310 | Block size of the cache devices. | ||
311 | |||
312 | btree_cache_size | ||
313 | Amount of memory currently used by the btree cache | ||
314 | |||
315 | bucket_size | ||
316 | Size of buckets | ||
317 | |||
318 | cache<0..n> | ||
319 | Symlink to each of the cache devices comprising this cache set. | ||
320 | |||
321 | cache_available_percent | ||
322 | Percentage of cache device free. | ||
323 | |||
324 | clear_stats | ||
325 | Clears the statistics associated with this cache | ||
326 | |||
327 | dirty_data | ||
328 | Amount of dirty data is in the cache (updated when garbage collection runs). | ||
329 | |||
330 | flash_vol_create | ||
331 | Echoing a size to this file (in human readable units, k/M/G) creates a thinly | ||
332 | provisioned volume backed by the cache set. | ||
333 | |||
334 | io_error_halflife | ||
335 | io_error_limit | ||
336 | These determines how many errors we accept before disabling the cache. | ||
337 | Each error is decayed by the half life (in # ios). If the decaying count | ||
338 | reaches io_error_limit dirty data is written out and the cache is disabled. | ||
339 | |||
340 | journal_delay_ms | ||
341 | Journal writes will delay for up to this many milliseconds, unless a cache | ||
342 | flush happens sooner. Defaults to 100. | ||
343 | |||
344 | root_usage_percent | ||
345 | Percentage of the root btree node in use. If this gets too high the node | ||
346 | will split, increasing the tree depth. | ||
347 | |||
348 | stop | ||
349 | Write to this file to shut down the cache set - waits until all attached | ||
350 | backing devices have been shut down. | ||
351 | |||
352 | tree_depth | ||
353 | Depth of the btree (A single node btree has depth 0). | ||
354 | |||
355 | unregister | ||
356 | Detaches all backing devices and closes the cache devices; if dirty data is | ||
357 | present it will disable writeback caching and wait for it to be flushed. | ||
358 | |||
359 | SYSFS - CACHE SET INTERNAL: | ||
360 | |||
361 | This directory also exposes timings for a number of internal operations, with | ||
362 | separate files for average duration, average frequency, last occurrence and max | ||
363 | duration: garbage collection, btree read, btree node sorts and btree splits. | ||
364 | |||
365 | active_journal_entries | ||
366 | Number of journal entries that are newer than the index. | ||
367 | |||
368 | btree_nodes | ||
369 | Total nodes in the btree. | ||
370 | |||
371 | btree_used_percent | ||
372 | Average fraction of btree in use. | ||
373 | |||
374 | bset_tree_stats | ||
375 | Statistics about the auxiliary search trees | ||
376 | |||
377 | btree_cache_max_chain | ||
378 | Longest chain in the btree node cache's hash table | ||
379 | |||
380 | cache_read_races | ||
381 | Counts instances where while data was being read from the cache, the bucket | ||
382 | was reused and invalidated - i.e. where the pointer was stale after the read | ||
383 | completed. When this occurs the data is reread from the backing device. | ||
384 | |||
385 | trigger_gc | ||
386 | Writing to this file forces garbage collection to run. | ||
387 | |||
388 | SYSFS - CACHE DEVICE: | ||
389 | |||
390 | block_size | ||
391 | Minimum granularity of writes - should match hardware sector size. | ||
392 | |||
393 | btree_written | ||
394 | Sum of all btree writes, in (kilo/mega/giga) bytes | ||
395 | |||
396 | bucket_size | ||
397 | Size of buckets | ||
398 | |||
399 | cache_replacement_policy | ||
400 | One of either lru, fifo or random. | ||
401 | |||
402 | discard | ||
403 | Boolean; if on a discard/TRIM will be issued to each bucket before it is | ||
404 | reused. Defaults to off, since SATA TRIM is an unqueued command (and thus | ||
405 | slow). | ||
406 | |||
407 | freelist_percent | ||
408 | Size of the freelist as a percentage of nbuckets. Can be written to to | ||
409 | increase the number of buckets kept on the freelist, which lets you | ||
410 | artificially reduce the size of the cache at runtime. Mostly for testing | ||
411 | purposes (i.e. testing how different size caches affect your hit rate), but | ||
412 | since buckets are discarded when they move on to the freelist will also make | ||
413 | the SSD's garbage collection easier by effectively giving it more reserved | ||
414 | space. | ||
415 | |||
416 | io_errors | ||
417 | Number of errors that have occurred, decayed by io_error_halflife. | ||
418 | |||
419 | metadata_written | ||
420 | Sum of all non data writes (btree writes and all other metadata). | ||
421 | |||
422 | nbuckets | ||
423 | Total buckets in this cache | ||
424 | |||
425 | priority_stats | ||
426 | Statistics about how recently data in the cache has been accessed. This can | ||
427 | reveal your working set size. | ||
428 | |||
429 | written | ||
430 | Sum of all data that has been written to the cache; comparison with | ||
431 | btree_written gives the amount of write inflation in bcache. | ||
diff --git a/MAINTAINERS b/MAINTAINERS index e73c374483cb..5f5c895e6621 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -1620,6 +1620,13 @@ W: http://www.baycom.org/~tom/ham/ham.html | |||
1620 | S: Maintained | 1620 | S: Maintained |
1621 | F: drivers/net/hamradio/baycom* | 1621 | F: drivers/net/hamradio/baycom* |
1622 | 1622 | ||
1623 | BCACHE (BLOCK LAYER CACHE) | ||
1624 | M: Kent Overstreet <koverstreet@google.com> | ||
1625 | L: linux-bcache@vger.kernel.org | ||
1626 | W: http://bcache.evilpiepirate.org | ||
1627 | S: Maintained | ||
1628 | F: drivers/md/bcache/ | ||
1629 | |||
1623 | BEFS FILE SYSTEM | 1630 | BEFS FILE SYSTEM |
1624 | S: Orphan | 1631 | S: Orphan |
1625 | F: Documentation/filesystems/befs.txt | 1632 | F: Documentation/filesystems/befs.txt |
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 5efed089a702..fc803ecbbce4 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c | |||
@@ -920,16 +920,14 @@ bio_pagedec(struct bio *bio) | |||
920 | static void | 920 | static void |
921 | bufinit(struct buf *buf, struct request *rq, struct bio *bio) | 921 | bufinit(struct buf *buf, struct request *rq, struct bio *bio) |
922 | { | 922 | { |
923 | struct bio_vec *bv; | ||
924 | |||
925 | memset(buf, 0, sizeof(*buf)); | 923 | memset(buf, 0, sizeof(*buf)); |
926 | buf->rq = rq; | 924 | buf->rq = rq; |
927 | buf->bio = bio; | 925 | buf->bio = bio; |
928 | buf->resid = bio->bi_size; | 926 | buf->resid = bio->bi_size; |
929 | buf->sector = bio->bi_sector; | 927 | buf->sector = bio->bi_sector; |
930 | bio_pageinc(bio); | 928 | bio_pageinc(bio); |
931 | buf->bv = bv = bio_iovec(bio); | 929 | buf->bv = bio_iovec(bio); |
932 | buf->bv_resid = bv->bv_len; | 930 | buf->bv_resid = buf->bv->bv_len; |
933 | WARN_ON(buf->bv_resid == 0); | 931 | WARN_ON(buf->bv_resid == 0); |
934 | } | 932 | } |
935 | 933 | ||
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 94b51c5e0678..6374dc103521 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c | |||
@@ -75,6 +75,12 @@ module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR); | |||
75 | MODULE_PARM_DESC(cciss_simple_mode, | 75 | MODULE_PARM_DESC(cciss_simple_mode, |
76 | "Use 'simple mode' rather than 'performant mode'"); | 76 | "Use 'simple mode' rather than 'performant mode'"); |
77 | 77 | ||
78 | static int cciss_allow_hpsa; | ||
79 | module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR); | ||
80 | MODULE_PARM_DESC(cciss_allow_hpsa, | ||
81 | "Prevent cciss driver from accessing hardware known to be " | ||
82 | " supported by the hpsa driver"); | ||
83 | |||
78 | static DEFINE_MUTEX(cciss_mutex); | 84 | static DEFINE_MUTEX(cciss_mutex); |
79 | static struct proc_dir_entry *proc_cciss; | 85 | static struct proc_dir_entry *proc_cciss; |
80 | 86 | ||
@@ -4115,9 +4121,13 @@ static int cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id) | |||
4115 | *board_id = ((subsystem_device_id << 16) & 0xffff0000) | | 4121 | *board_id = ((subsystem_device_id << 16) & 0xffff0000) | |
4116 | subsystem_vendor_id; | 4122 | subsystem_vendor_id; |
4117 | 4123 | ||
4118 | for (i = 0; i < ARRAY_SIZE(products); i++) | 4124 | for (i = 0; i < ARRAY_SIZE(products); i++) { |
4125 | /* Stand aside for hpsa driver on request */ | ||
4126 | if (cciss_allow_hpsa) | ||
4127 | return -ENODEV; | ||
4119 | if (*board_id == products[i].board_id) | 4128 | if (*board_id == products[i].board_id) |
4120 | return i; | 4129 | return i; |
4130 | } | ||
4121 | dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n", | 4131 | dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n", |
4122 | *board_id); | 4132 | *board_id); |
4123 | return -ENODEV; | 4133 | return -ENODEV; |
@@ -4959,6 +4969,16 @@ static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) | |||
4959 | ctlr_info_t *h; | 4969 | ctlr_info_t *h; |
4960 | unsigned long flags; | 4970 | unsigned long flags; |
4961 | 4971 | ||
4972 | /* | ||
4973 | * By default the cciss driver is used for all older HP Smart Array | ||
4974 | * controllers. There are module parameters that allow a user to | ||
4975 | * override this behavior and instead use the hpsa SCSI driver. If | ||
4976 | * this is the case cciss may be loaded first from the kdump initrd | ||
4977 | * image and cause a kernel panic. So if reset_devices is true and | ||
4978 | * cciss_allow_hpsa is set just bail. | ||
4979 | */ | ||
4980 | if ((reset_devices) && (cciss_allow_hpsa == 1)) | ||
4981 | return -ENODEV; | ||
4962 | rc = cciss_init_reset_devices(pdev); | 4982 | rc = cciss_init_reset_devices(pdev); |
4963 | if (rc) { | 4983 | if (rc) { |
4964 | if (rc != -ENOTSUPP) | 4984 | if (rc != -ENOTSUPP) |
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 92510f8ad013..6608076dc39e 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c | |||
@@ -104,7 +104,6 @@ struct update_al_work { | |||
104 | int err; | 104 | int err; |
105 | }; | 105 | }; |
106 | 106 | ||
107 | static int al_write_transaction(struct drbd_conf *mdev); | ||
108 | 107 | ||
109 | void *drbd_md_get_buffer(struct drbd_conf *mdev) | 108 | void *drbd_md_get_buffer(struct drbd_conf *mdev) |
110 | { | 109 | { |
@@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
168 | bio->bi_end_io = drbd_md_io_complete; | 167 | bio->bi_end_io = drbd_md_io_complete; |
169 | bio->bi_rw = rw; | 168 | bio->bi_rw = rw; |
170 | 169 | ||
171 | if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ | 170 | if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL) |
171 | /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ | ||
172 | ; | ||
173 | else if (!get_ldev_if_state(mdev, D_ATTACHING)) { | ||
174 | /* Corresponding put_ldev in drbd_md_io_complete() */ | ||
172 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); | 175 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); |
173 | err = -ENODEV; | 176 | err = -ENODEV; |
174 | goto out; | 177 | goto out; |
@@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
199 | 202 | ||
200 | BUG_ON(!bdev->md_bdev); | 203 | BUG_ON(!bdev->md_bdev); |
201 | 204 | ||
202 | dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", | 205 | dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", |
203 | current->comm, current->pid, __func__, | 206 | current->comm, current->pid, __func__, |
204 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 207 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", |
208 | (void*)_RET_IP_ ); | ||
205 | 209 | ||
206 | if (sector < drbd_md_first_sector(bdev) || | 210 | if (sector < drbd_md_first_sector(bdev) || |
207 | sector + 7 > drbd_md_last_sector(bdev)) | 211 | sector + 7 > drbd_md_last_sector(bdev)) |
@@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
209 | current->comm, current->pid, __func__, | 213 | current->comm, current->pid, __func__, |
210 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 214 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); |
211 | 215 | ||
212 | err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); | 216 | /* we do all our meta data IO in aligned 4k blocks. */ |
217 | err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096); | ||
213 | if (err) { | 218 | if (err) { |
214 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", | 219 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", |
215 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); | 220 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); |
@@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
217 | return err; | 222 | return err; |
218 | } | 223 | } |
219 | 224 | ||
220 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | 225 | static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr) |
221 | { | 226 | { |
222 | struct lc_element *al_ext; | ||
223 | struct lc_element *tmp; | 227 | struct lc_element *tmp; |
224 | int wake; | ||
225 | |||
226 | spin_lock_irq(&mdev->al_lock); | ||
227 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); | 228 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); |
228 | if (unlikely(tmp != NULL)) { | 229 | if (unlikely(tmp != NULL)) { |
229 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); | 230 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); |
230 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { | 231 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) |
231 | wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); | 232 | return bm_ext; |
232 | spin_unlock_irq(&mdev->al_lock); | 233 | } |
233 | if (wake) | 234 | return NULL; |
234 | wake_up(&mdev->al_wait); | 235 | } |
235 | return NULL; | 236 | |
236 | } | 237 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock) |
238 | { | ||
239 | struct lc_element *al_ext; | ||
240 | struct bm_extent *bm_ext; | ||
241 | int wake; | ||
242 | |||
243 | spin_lock_irq(&mdev->al_lock); | ||
244 | bm_ext = find_active_resync_extent(mdev, enr); | ||
245 | if (bm_ext) { | ||
246 | wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); | ||
247 | spin_unlock_irq(&mdev->al_lock); | ||
248 | if (wake) | ||
249 | wake_up(&mdev->al_wait); | ||
250 | return NULL; | ||
237 | } | 251 | } |
238 | al_ext = lc_get(mdev->act_log, enr); | 252 | if (nonblock) |
253 | al_ext = lc_try_get(mdev->act_log, enr); | ||
254 | else | ||
255 | al_ext = lc_get(mdev->act_log, enr); | ||
239 | spin_unlock_irq(&mdev->al_lock); | 256 | spin_unlock_irq(&mdev->al_lock); |
240 | return al_ext; | 257 | return al_ext; |
241 | } | 258 | } |
242 | 259 | ||
243 | void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | 260 | bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i) |
244 | { | 261 | { |
245 | /* for bios crossing activity log extent boundaries, | 262 | /* for bios crossing activity log extent boundaries, |
246 | * we may need to activate two extents in one go */ | 263 | * we may need to activate two extents in one go */ |
247 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | 264 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); |
248 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | 265 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); |
249 | unsigned enr; | ||
250 | bool locked = false; | ||
251 | 266 | ||
267 | D_ASSERT((unsigned)(last - first) <= 1); | ||
268 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | ||
269 | |||
270 | /* FIXME figure out a fast path for bios crossing AL extent boundaries */ | ||
271 | if (first != last) | ||
272 | return false; | ||
273 | |||
274 | return _al_get(mdev, first, true); | ||
275 | } | ||
276 | |||
277 | bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i) | ||
278 | { | ||
279 | /* for bios crossing activity log extent boundaries, | ||
280 | * we may need to activate two extents in one go */ | ||
281 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | ||
282 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
283 | unsigned enr; | ||
284 | bool need_transaction = false; | ||
252 | 285 | ||
253 | D_ASSERT(first <= last); | 286 | D_ASSERT(first <= last); |
254 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | 287 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); |
255 | 288 | ||
256 | for (enr = first; enr <= last; enr++) | 289 | for (enr = first; enr <= last; enr++) { |
257 | wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); | 290 | struct lc_element *al_ext; |
291 | wait_event(mdev->al_wait, | ||
292 | (al_ext = _al_get(mdev, enr, false)) != NULL); | ||
293 | if (al_ext->lc_number != enr) | ||
294 | need_transaction = true; | ||
295 | } | ||
296 | return need_transaction; | ||
297 | } | ||
298 | |||
299 | static int al_write_transaction(struct drbd_conf *mdev, bool delegate); | ||
300 | |||
301 | /* When called through generic_make_request(), we must delegate | ||
302 | * activity log I/O to the worker thread: a further request | ||
303 | * submitted via generic_make_request() within the same task | ||
304 | * would be queued on current->bio_list, and would only start | ||
305 | * after this function returns (see generic_make_request()). | ||
306 | * | ||
307 | * However, if we *are* the worker, we must not delegate to ourselves. | ||
308 | */ | ||
309 | |||
310 | /* | ||
311 | * @delegate: delegate activity log I/O to the worker thread | ||
312 | */ | ||
313 | void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate) | ||
314 | { | ||
315 | bool locked = false; | ||
316 | |||
317 | BUG_ON(delegate && current == mdev->tconn->worker.task); | ||
258 | 318 | ||
259 | /* Serialize multiple transactions. | 319 | /* Serialize multiple transactions. |
260 | * This uses test_and_set_bit, memory barrier is implicit. | 320 | * This uses test_and_set_bit, memory barrier is implicit. |
@@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | |||
264 | (locked = lc_try_lock_for_transaction(mdev->act_log))); | 324 | (locked = lc_try_lock_for_transaction(mdev->act_log))); |
265 | 325 | ||
266 | if (locked) { | 326 | if (locked) { |
267 | /* drbd_al_write_transaction(mdev,al_ext,enr); | ||
268 | * recurses into generic_make_request(), which | ||
269 | * disallows recursion, bios being serialized on the | ||
270 | * current->bio_tail list now. | ||
271 | * we have to delegate updates to the activity log | ||
272 | * to the worker thread. */ | ||
273 | |||
274 | /* Double check: it may have been committed by someone else, | 327 | /* Double check: it may have been committed by someone else, |
275 | * while we have been waiting for the lock. */ | 328 | * while we have been waiting for the lock. */ |
276 | if (mdev->act_log->pending_changes) { | 329 | if (mdev->act_log->pending_changes) { |
@@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | |||
280 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; | 333 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; |
281 | rcu_read_unlock(); | 334 | rcu_read_unlock(); |
282 | 335 | ||
283 | if (write_al_updates) { | 336 | if (write_al_updates) |
284 | al_write_transaction(mdev); | 337 | al_write_transaction(mdev, delegate); |
285 | mdev->al_writ_cnt++; | ||
286 | } | ||
287 | |||
288 | spin_lock_irq(&mdev->al_lock); | 338 | spin_lock_irq(&mdev->al_lock); |
289 | /* FIXME | 339 | /* FIXME |
290 | if (err) | 340 | if (err) |
@@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | |||
298 | } | 348 | } |
299 | } | 349 | } |
300 | 350 | ||
351 | /* | ||
352 | * @delegate: delegate activity log I/O to the worker thread | ||
353 | */ | ||
354 | void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate) | ||
355 | { | ||
356 | BUG_ON(delegate && current == mdev->tconn->worker.task); | ||
357 | |||
358 | if (drbd_al_begin_io_prepare(mdev, i)) | ||
359 | drbd_al_begin_io_commit(mdev, delegate); | ||
360 | } | ||
361 | |||
362 | int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i) | ||
363 | { | ||
364 | struct lru_cache *al = mdev->act_log; | ||
365 | /* for bios crossing activity log extent boundaries, | ||
366 | * we may need to activate two extents in one go */ | ||
367 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | ||
368 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
369 | unsigned nr_al_extents; | ||
370 | unsigned available_update_slots; | ||
371 | unsigned enr; | ||
372 | |||
373 | D_ASSERT(first <= last); | ||
374 | |||
375 | nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */ | ||
376 | available_update_slots = min(al->nr_elements - al->used, | ||
377 | al->max_pending_changes - al->pending_changes); | ||
378 | |||
379 | /* We want all necessary updates for a given request within the same transaction | ||
380 | * We could first check how many updates are *actually* needed, | ||
381 | * and use that instead of the worst-case nr_al_extents */ | ||
382 | if (available_update_slots < nr_al_extents) | ||
383 | return -EWOULDBLOCK; | ||
384 | |||
385 | /* Is resync active in this area? */ | ||
386 | for (enr = first; enr <= last; enr++) { | ||
387 | struct lc_element *tmp; | ||
388 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); | ||
389 | if (unlikely(tmp != NULL)) { | ||
390 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); | ||
391 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { | ||
392 | if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) | ||
393 | return -EBUSY; | ||
394 | return -EWOULDBLOCK; | ||
395 | } | ||
396 | } | ||
397 | } | ||
398 | |||
399 | /* Checkout the refcounts. | ||
400 | * Given that we checked for available elements and update slots above, | ||
401 | * this has to be successful. */ | ||
402 | for (enr = first; enr <= last; enr++) { | ||
403 | struct lc_element *al_ext; | ||
404 | al_ext = lc_get_cumulative(mdev->act_log, enr); | ||
405 | if (!al_ext) | ||
406 | dev_info(DEV, "LOGIC BUG for enr=%u\n", enr); | ||
407 | } | ||
408 | return 0; | ||
409 | } | ||
410 | |||
301 | void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) | 411 | void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) |
302 | { | 412 | { |
303 | /* for bios crossing activity log extent boundaries, | 413 | /* for bios crossing activity log extent boundaries, |
@@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) | |||
350 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); | 460 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); |
351 | } | 461 | } |
352 | 462 | ||
463 | static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev) | ||
464 | { | ||
465 | const unsigned int stripes = mdev->ldev->md.al_stripes; | ||
466 | const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k; | ||
467 | |||
468 | /* transaction number, modulo on-disk ring buffer wrap around */ | ||
469 | unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k); | ||
470 | |||
471 | /* ... to aligned 4k on disk block */ | ||
472 | t = ((t % stripes) * stripe_size_4kB) + t/stripes; | ||
473 | |||
474 | /* ... to 512 byte sector in activity log */ | ||
475 | t *= 8; | ||
476 | |||
477 | /* ... plus offset to the on disk position */ | ||
478 | return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t; | ||
479 | } | ||
480 | |||
353 | static int | 481 | static int |
354 | _al_write_transaction(struct drbd_conf *mdev) | 482 | _al_write_transaction(struct drbd_conf *mdev) |
355 | { | 483 | { |
@@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev) | |||
432 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) | 560 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) |
433 | mdev->al_tr_cycle = 0; | 561 | mdev->al_tr_cycle = 0; |
434 | 562 | ||
435 | sector = mdev->ldev->md.md_offset | 563 | sector = al_tr_number_to_on_disk_sector(mdev); |
436 | + mdev->ldev->md.al_offset | ||
437 | + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); | ||
438 | 564 | ||
439 | crc = crc32c(0, buffer, 4096); | 565 | crc = crc32c(0, buffer, 4096); |
440 | buffer->crc32c = cpu_to_be32(crc); | 566 | buffer->crc32c = cpu_to_be32(crc); |
441 | 567 | ||
442 | if (drbd_bm_write_hinted(mdev)) | 568 | if (drbd_bm_write_hinted(mdev)) |
443 | err = -EIO; | 569 | err = -EIO; |
444 | /* drbd_chk_io_error done already */ | 570 | else { |
445 | else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | 571 | bool write_al_updates; |
446 | err = -EIO; | 572 | rcu_read_lock(); |
447 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 573 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; |
448 | } else { | 574 | rcu_read_unlock(); |
449 | /* advance ringbuffer position and transaction counter */ | 575 | if (write_al_updates) { |
450 | mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); | 576 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { |
451 | mdev->al_tr_number++; | 577 | err = -EIO; |
578 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | ||
579 | } else { | ||
580 | mdev->al_tr_number++; | ||
581 | mdev->al_writ_cnt++; | ||
582 | } | ||
583 | } | ||
452 | } | 584 | } |
453 | 585 | ||
454 | drbd_md_put_buffer(mdev); | 586 | drbd_md_put_buffer(mdev); |
@@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused) | |||
474 | /* Calls from worker context (see w_restart_disk_io()) need to write the | 606 | /* Calls from worker context (see w_restart_disk_io()) need to write the |
475 | transaction directly. Others came through generic_make_request(), | 607 | transaction directly. Others came through generic_make_request(), |
476 | those need to delegate it to the worker. */ | 608 | those need to delegate it to the worker. */ |
477 | static int al_write_transaction(struct drbd_conf *mdev) | 609 | static int al_write_transaction(struct drbd_conf *mdev, bool delegate) |
478 | { | 610 | { |
479 | struct update_al_work al_work; | 611 | if (delegate) { |
480 | 612 | struct update_al_work al_work; | |
481 | if (current == mdev->tconn->worker.task) | 613 | init_completion(&al_work.event); |
614 | al_work.w.cb = w_al_write_transaction; | ||
615 | al_work.w.mdev = mdev; | ||
616 | drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); | ||
617 | wait_for_completion(&al_work.event); | ||
618 | return al_work.err; | ||
619 | } else | ||
482 | return _al_write_transaction(mdev); | 620 | return _al_write_transaction(mdev); |
483 | |||
484 | init_completion(&al_work.event); | ||
485 | al_work.w.cb = w_al_write_transaction; | ||
486 | al_work.w.mdev = mdev; | ||
487 | drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); | ||
488 | wait_for_completion(&al_work.event); | ||
489 | |||
490 | return al_work.err; | ||
491 | } | 621 | } |
492 | 622 | ||
493 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) | 623 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) |
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 8dc29502dc08..64fbb8385cdc 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c | |||
@@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) | |||
612 | } | 612 | } |
613 | } | 613 | } |
614 | 614 | ||
615 | /* For the layout, see comment above drbd_md_set_sector_offsets(). */ | ||
616 | static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev) | ||
617 | { | ||
618 | u64 bitmap_sectors; | ||
619 | if (ldev->md.al_offset == 8) | ||
620 | bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset; | ||
621 | else | ||
622 | bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset; | ||
623 | return bitmap_sectors << (9 + 3); | ||
624 | } | ||
625 | |||
615 | /* | 626 | /* |
616 | * make sure the bitmap has enough room for the attached storage, | 627 | * make sure the bitmap has enough room for the attached storage, |
617 | * if necessary, resize. | 628 | * if necessary, resize. |
@@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) | |||
668 | words = ALIGN(bits, 64) >> LN2_BPL; | 679 | words = ALIGN(bits, 64) >> LN2_BPL; |
669 | 680 | ||
670 | if (get_ldev(mdev)) { | 681 | if (get_ldev(mdev)) { |
671 | u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12; | 682 | u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev); |
672 | put_ldev(mdev); | 683 | put_ldev(mdev); |
673 | if (bits > bits_on_disk) { | 684 | if (bits > bits_on_disk) { |
674 | dev_info(DEV, "bits = %lu\n", bits); | 685 | dev_info(DEV, "bits = %lu\n", bits); |
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 6b51afa1aae1..f943aacfdad8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -753,13 +753,16 @@ struct drbd_md { | |||
753 | u32 flags; | 753 | u32 flags; |
754 | u32 md_size_sect; | 754 | u32 md_size_sect; |
755 | 755 | ||
756 | s32 al_offset; /* signed relative sector offset to al area */ | 756 | s32 al_offset; /* signed relative sector offset to activity log */ |
757 | s32 bm_offset; /* signed relative sector offset to bitmap */ | 757 | s32 bm_offset; /* signed relative sector offset to bitmap */ |
758 | 758 | ||
759 | /* u32 al_nr_extents; important for restoring the AL | 759 | /* cached value of bdev->disk_conf->meta_dev_idx (see below) */ |
760 | * is stored into ldev->dc.al_extents, which in turn | 760 | s32 meta_dev_idx; |
761 | * gets applied to act_log->nr_elements | 761 | |
762 | */ | 762 | /* see al_tr_number_to_on_disk_sector() */ |
763 | u32 al_stripes; | ||
764 | u32 al_stripe_size_4k; | ||
765 | u32 al_size_4k; /* cached product of the above */ | ||
763 | }; | 766 | }; |
764 | 767 | ||
765 | struct drbd_backing_dev { | 768 | struct drbd_backing_dev { |
@@ -891,6 +894,14 @@ struct drbd_tconn { /* is a resource from the config file */ | |||
891 | } send; | 894 | } send; |
892 | }; | 895 | }; |
893 | 896 | ||
897 | struct submit_worker { | ||
898 | struct workqueue_struct *wq; | ||
899 | struct work_struct worker; | ||
900 | |||
901 | spinlock_t lock; | ||
902 | struct list_head writes; | ||
903 | }; | ||
904 | |||
894 | struct drbd_conf { | 905 | struct drbd_conf { |
895 | struct drbd_tconn *tconn; | 906 | struct drbd_tconn *tconn; |
896 | int vnr; /* volume number within the connection */ | 907 | int vnr; /* volume number within the connection */ |
@@ -1009,7 +1020,6 @@ struct drbd_conf { | |||
1009 | struct lru_cache *act_log; /* activity log */ | 1020 | struct lru_cache *act_log; /* activity log */ |
1010 | unsigned int al_tr_number; | 1021 | unsigned int al_tr_number; |
1011 | int al_tr_cycle; | 1022 | int al_tr_cycle; |
1012 | int al_tr_pos; /* position of the next transaction in the journal */ | ||
1013 | wait_queue_head_t seq_wait; | 1023 | wait_queue_head_t seq_wait; |
1014 | atomic_t packet_seq; | 1024 | atomic_t packet_seq; |
1015 | unsigned int peer_seq; | 1025 | unsigned int peer_seq; |
@@ -1032,6 +1042,10 @@ struct drbd_conf { | |||
1032 | atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ | 1042 | atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ |
1033 | unsigned int peer_max_bio_size; | 1043 | unsigned int peer_max_bio_size; |
1034 | unsigned int local_max_bio_size; | 1044 | unsigned int local_max_bio_size; |
1045 | |||
1046 | /* any requests that would block in drbd_make_request() | ||
1047 | * are deferred to this single-threaded work queue */ | ||
1048 | struct submit_worker submit; | ||
1035 | }; | 1049 | }; |
1036 | 1050 | ||
1037 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) | 1051 | static inline struct drbd_conf *minor_to_mdev(unsigned int minor) |
@@ -1148,25 +1162,44 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, | |||
1148 | char *why, enum bm_flag flags); | 1162 | char *why, enum bm_flag flags); |
1149 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); | 1163 | extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); |
1150 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); | 1164 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); |
1151 | extern void drbd_go_diskless(struct drbd_conf *mdev); | ||
1152 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); | 1165 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); |
1153 | 1166 | ||
1154 | /* Meta data layout | 1167 | /* Meta data layout |
1155 | We reserve a 128MB Block (4k aligned) | 1168 | * |
1156 | * either at the end of the backing device | 1169 | * We currently have two possible layouts. |
1157 | * or on a separate meta data device. */ | 1170 | * Offsets in (512 byte) sectors. |
1171 | * external: | ||
1172 | * |----------- md_size_sect ------------------| | ||
1173 | * [ 4k superblock ][ activity log ][ Bitmap ] | ||
1174 | * | al_offset == 8 | | ||
1175 | * | bm_offset = al_offset + X | | ||
1176 | * ==> bitmap sectors = md_size_sect - bm_offset | ||
1177 | * | ||
1178 | * Variants: | ||
1179 | * old, indexed fixed size meta data: | ||
1180 | * | ||
1181 | * internal: | ||
1182 | * |----------- md_size_sect ------------------| | ||
1183 | * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*] | ||
1184 | * | al_offset < 0 | | ||
1185 | * | bm_offset = al_offset - Y | | ||
1186 | * ==> bitmap sectors = Y = al_offset - bm_offset | ||
1187 | * | ||
1188 | * [padding*] are zero or up to 7 unused 512 Byte sectors to the | ||
1189 | * end of the device, so that the [4k superblock] will be 4k aligned. | ||
1190 | * | ||
1191 | * The activity log consists of 4k transaction blocks, | ||
1192 | * which are written in a ring-buffer, or striped ring-buffer like fashion, | ||
1193 | * whose total size used to be fixed 32kB, | ||
1194 | * but is about to become configurable. | ||
1195 | */ | ||
1158 | 1196 | ||
1159 | /* The following numbers are sectors */ | 1197 | /* Our old fixed size meta data layout |
1160 | /* Allows up to about 3.8TB, so if you want more, | 1198 | * allows up to about 3.8TB, so if you want more, |
1161 | * you need to use the "flexible" meta data format. */ | 1199 | * you need to use the "flexible" meta data format. */ |
1162 | #define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ | 1200 | #define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */ |
1163 | #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ | 1201 | #define MD_4kB_SECT 8 |
1164 | #define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ | 1202 | #define MD_32kB_SECT 64 |
1165 | #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS) | ||
1166 | |||
1167 | /* we do all meta data IO in 4k blocks */ | ||
1168 | #define MD_BLOCK_SHIFT 12 | ||
1169 | #define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT) | ||
1170 | 1203 | ||
1171 | /* One activity log extent represents 4M of storage */ | 1204 | /* One activity log extent represents 4M of storage */ |
1172 | #define AL_EXTENT_SHIFT 22 | 1205 | #define AL_EXTENT_SHIFT 22 |
@@ -1256,7 +1289,6 @@ struct bm_extent { | |||
1256 | 1289 | ||
1257 | /* in one sector of the bitmap, we have this many activity_log extents. */ | 1290 | /* in one sector of the bitmap, we have this many activity_log extents. */ |
1258 | #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) | 1291 | #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) |
1259 | #define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) | ||
1260 | 1292 | ||
1261 | #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) | 1293 | #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) |
1262 | #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) | 1294 | #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) |
@@ -1276,16 +1308,18 @@ struct bm_extent { | |||
1276 | */ | 1308 | */ |
1277 | 1309 | ||
1278 | #define DRBD_MAX_SECTORS_32 (0xffffffffLU) | 1310 | #define DRBD_MAX_SECTORS_32 (0xffffffffLU) |
1279 | #define DRBD_MAX_SECTORS_BM \ | 1311 | /* we have a certain meta data variant that has a fixed on-disk size of 128 |
1280 | ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9))) | 1312 | * MiB, of which 4k are our "superblock", and 32k are the fixed size activity |
1281 | #if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 | 1313 | * log, leaving this many sectors for the bitmap. |
1282 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM | 1314 | */ |
1283 | #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM | 1315 | |
1284 | #elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 | 1316 | #define DRBD_MAX_SECTORS_FIXED_BM \ |
1317 | ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9))) | ||
1318 | #if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 | ||
1285 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 | 1319 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 |
1286 | #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 | 1320 | #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 |
1287 | #else | 1321 | #else |
1288 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM | 1322 | #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM |
1289 | /* 16 TB in units of sectors */ | 1323 | /* 16 TB in units of sectors */ |
1290 | #if BITS_PER_LONG == 32 | 1324 | #if BITS_PER_LONG == 32 |
1291 | /* adjust by one page worth of bitmap, | 1325 | /* adjust by one page worth of bitmap, |
@@ -1418,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn); | |||
1418 | extern int proc_details; | 1452 | extern int proc_details; |
1419 | 1453 | ||
1420 | /* drbd_req */ | 1454 | /* drbd_req */ |
1455 | extern void do_submit(struct work_struct *ws); | ||
1421 | extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); | 1456 | extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); |
1422 | extern void drbd_make_request(struct request_queue *q, struct bio *bio); | 1457 | extern void drbd_make_request(struct request_queue *q, struct bio *bio); |
1423 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); | 1458 | extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); |
@@ -1576,7 +1611,10 @@ extern const char *drbd_conn_str(enum drbd_conns s); | |||
1576 | extern const char *drbd_role_str(enum drbd_role s); | 1611 | extern const char *drbd_role_str(enum drbd_role s); |
1577 | 1612 | ||
1578 | /* drbd_actlog.c */ | 1613 | /* drbd_actlog.c */ |
1579 | extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); | 1614 | extern int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i); |
1615 | extern void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate); | ||
1616 | extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i); | ||
1617 | extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate); | ||
1580 | extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); | 1618 | extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); |
1581 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); | 1619 | extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); |
1582 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); | 1620 | extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); |
@@ -1755,9 +1793,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, | |||
1755 | * BTW, for internal meta data, this happens to be the maximum capacity | 1793 | * BTW, for internal meta data, this happens to be the maximum capacity |
1756 | * we could agree upon with our peer node. | 1794 | * we could agree upon with our peer node. |
1757 | */ | 1795 | */ |
1758 | static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) | 1796 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) |
1759 | { | 1797 | { |
1760 | switch (meta_dev_idx) { | 1798 | switch (bdev->md.meta_dev_idx) { |
1761 | case DRBD_MD_INDEX_INTERNAL: | 1799 | case DRBD_MD_INDEX_INTERNAL: |
1762 | case DRBD_MD_INDEX_FLEX_INT: | 1800 | case DRBD_MD_INDEX_FLEX_INT: |
1763 | return bdev->md.md_offset + bdev->md.bm_offset; | 1801 | return bdev->md.md_offset + bdev->md.bm_offset; |
@@ -1767,36 +1805,19 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi | |||
1767 | } | 1805 | } |
1768 | } | 1806 | } |
1769 | 1807 | ||
1770 | static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) | ||
1771 | { | ||
1772 | int meta_dev_idx; | ||
1773 | |||
1774 | rcu_read_lock(); | ||
1775 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1776 | rcu_read_unlock(); | ||
1777 | |||
1778 | return _drbd_md_first_sector(meta_dev_idx, bdev); | ||
1779 | } | ||
1780 | |||
1781 | /** | 1808 | /** |
1782 | * drbd_md_last_sector() - Return the last sector number of the meta data area | 1809 | * drbd_md_last_sector() - Return the last sector number of the meta data area |
1783 | * @bdev: Meta data block device. | 1810 | * @bdev: Meta data block device. |
1784 | */ | 1811 | */ |
1785 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) | 1812 | static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) |
1786 | { | 1813 | { |
1787 | int meta_dev_idx; | 1814 | switch (bdev->md.meta_dev_idx) { |
1788 | |||
1789 | rcu_read_lock(); | ||
1790 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1791 | rcu_read_unlock(); | ||
1792 | |||
1793 | switch (meta_dev_idx) { | ||
1794 | case DRBD_MD_INDEX_INTERNAL: | 1815 | case DRBD_MD_INDEX_INTERNAL: |
1795 | case DRBD_MD_INDEX_FLEX_INT: | 1816 | case DRBD_MD_INDEX_FLEX_INT: |
1796 | return bdev->md.md_offset + MD_AL_OFFSET - 1; | 1817 | return bdev->md.md_offset + MD_4kB_SECT -1; |
1797 | case DRBD_MD_INDEX_FLEX_EXT: | 1818 | case DRBD_MD_INDEX_FLEX_EXT: |
1798 | default: | 1819 | default: |
1799 | return bdev->md.md_offset + bdev->md.md_size_sect; | 1820 | return bdev->md.md_offset + bdev->md.md_size_sect -1; |
1800 | } | 1821 | } |
1801 | } | 1822 | } |
1802 | 1823 | ||
@@ -1818,18 +1839,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev) | |||
1818 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | 1839 | static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) |
1819 | { | 1840 | { |
1820 | sector_t s; | 1841 | sector_t s; |
1821 | int meta_dev_idx; | ||
1822 | 1842 | ||
1823 | rcu_read_lock(); | 1843 | switch (bdev->md.meta_dev_idx) { |
1824 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1825 | rcu_read_unlock(); | ||
1826 | |||
1827 | switch (meta_dev_idx) { | ||
1828 | case DRBD_MD_INDEX_INTERNAL: | 1844 | case DRBD_MD_INDEX_INTERNAL: |
1829 | case DRBD_MD_INDEX_FLEX_INT: | 1845 | case DRBD_MD_INDEX_FLEX_INT: |
1830 | s = drbd_get_capacity(bdev->backing_bdev) | 1846 | s = drbd_get_capacity(bdev->backing_bdev) |
1831 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, | 1847 | ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, |
1832 | _drbd_md_first_sector(meta_dev_idx, bdev)) | 1848 | drbd_md_first_sector(bdev)) |
1833 | : 0; | 1849 | : 0; |
1834 | break; | 1850 | break; |
1835 | case DRBD_MD_INDEX_FLEX_EXT: | 1851 | case DRBD_MD_INDEX_FLEX_EXT: |
@@ -1848,39 +1864,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) | |||
1848 | } | 1864 | } |
1849 | 1865 | ||
1850 | /** | 1866 | /** |
1851 | * drbd_md_ss__() - Return the sector number of our meta data super block | 1867 | * drbd_md_ss() - Return the sector number of our meta data super block |
1852 | * @mdev: DRBD device. | ||
1853 | * @bdev: Meta data block device. | 1868 | * @bdev: Meta data block device. |
1854 | */ | 1869 | */ |
1855 | static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, | 1870 | static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev) |
1856 | struct drbd_backing_dev *bdev) | ||
1857 | { | 1871 | { |
1858 | int meta_dev_idx; | 1872 | const int meta_dev_idx = bdev->md.meta_dev_idx; |
1859 | 1873 | ||
1860 | rcu_read_lock(); | 1874 | if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT) |
1861 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
1862 | rcu_read_unlock(); | ||
1863 | |||
1864 | switch (meta_dev_idx) { | ||
1865 | default: /* external, some index */ | ||
1866 | return MD_RESERVED_SECT * meta_dev_idx; | ||
1867 | case DRBD_MD_INDEX_INTERNAL: | ||
1868 | /* with drbd08, internal meta data is always "flexible" */ | ||
1869 | case DRBD_MD_INDEX_FLEX_INT: | ||
1870 | /* sizeof(struct md_on_disk_07) == 4k | ||
1871 | * position: last 4k aligned block of 4k size */ | ||
1872 | if (!bdev->backing_bdev) { | ||
1873 | if (__ratelimit(&drbd_ratelimit_state)) { | ||
1874 | dev_err(DEV, "bdev->backing_bdev==NULL\n"); | ||
1875 | dump_stack(); | ||
1876 | } | ||
1877 | return 0; | ||
1878 | } | ||
1879 | return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) | ||
1880 | - MD_AL_OFFSET; | ||
1881 | case DRBD_MD_INDEX_FLEX_EXT: | ||
1882 | return 0; | 1875 | return 0; |
1883 | } | 1876 | |
1877 | /* Since drbd08, internal meta data is always "flexible". | ||
1878 | * position: last 4k aligned block of 4k size */ | ||
1879 | if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL || | ||
1880 | meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) | ||
1881 | return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; | ||
1882 | |||
1883 | /* external, some index; this is the old fixed size layout */ | ||
1884 | return MD_128MB_SECT * bdev->md.meta_dev_idx; | ||
1884 | } | 1885 | } |
1885 | 1886 | ||
1886 | static inline void | 1887 | static inline void |
@@ -2053,9 +2054,11 @@ static inline void put_ldev(struct drbd_conf *mdev) | |||
2053 | if (mdev->state.disk == D_DISKLESS) | 2054 | if (mdev->state.disk == D_DISKLESS) |
2054 | /* even internal references gone, safe to destroy */ | 2055 | /* even internal references gone, safe to destroy */ |
2055 | drbd_ldev_destroy(mdev); | 2056 | drbd_ldev_destroy(mdev); |
2056 | if (mdev->state.disk == D_FAILED) | 2057 | if (mdev->state.disk == D_FAILED) { |
2057 | /* all application IO references gone. */ | 2058 | /* all application IO references gone. */ |
2058 | drbd_go_diskless(mdev); | 2059 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) |
2060 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); | ||
2061 | } | ||
2059 | wake_up(&mdev->misc_wait); | 2062 | wake_up(&mdev->misc_wait); |
2060 | } | 2063 | } |
2061 | } | 2064 | } |
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 298b868910dc..a5dca6affcbb 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -45,7 +45,7 @@ | |||
45 | #include <linux/reboot.h> | 45 | #include <linux/reboot.h> |
46 | #include <linux/notifier.h> | 46 | #include <linux/notifier.h> |
47 | #include <linux/kthread.h> | 47 | #include <linux/kthread.h> |
48 | 48 | #include <linux/workqueue.h> | |
49 | #define __KERNEL_SYSCALLS__ | 49 | #define __KERNEL_SYSCALLS__ |
50 | #include <linux/unistd.h> | 50 | #include <linux/unistd.h> |
51 | #include <linux/vmalloc.h> | 51 | #include <linux/vmalloc.h> |
@@ -2299,6 +2299,7 @@ static void drbd_cleanup(void) | |||
2299 | idr_for_each_entry(&minors, mdev, i) { | 2299 | idr_for_each_entry(&minors, mdev, i) { |
2300 | idr_remove(&minors, mdev_to_minor(mdev)); | 2300 | idr_remove(&minors, mdev_to_minor(mdev)); |
2301 | idr_remove(&mdev->tconn->volumes, mdev->vnr); | 2301 | idr_remove(&mdev->tconn->volumes, mdev->vnr); |
2302 | destroy_workqueue(mdev->submit.wq); | ||
2302 | del_gendisk(mdev->vdisk); | 2303 | del_gendisk(mdev->vdisk); |
2303 | /* synchronize_rcu(); No other threads running at this point */ | 2304 | /* synchronize_rcu(); No other threads running at this point */ |
2304 | kref_put(&mdev->kref, &drbd_minor_destroy); | 2305 | kref_put(&mdev->kref, &drbd_minor_destroy); |
@@ -2588,6 +2589,21 @@ void conn_destroy(struct kref *kref) | |||
2588 | kfree(tconn); | 2589 | kfree(tconn); |
2589 | } | 2590 | } |
2590 | 2591 | ||
2592 | int init_submitter(struct drbd_conf *mdev) | ||
2593 | { | ||
2594 | /* opencoded create_singlethread_workqueue(), | ||
2595 | * to be able to say "drbd%d", ..., minor */ | ||
2596 | mdev->submit.wq = alloc_workqueue("drbd%u_submit", | ||
2597 | WQ_UNBOUND | WQ_MEM_RECLAIM, 1, mdev->minor); | ||
2598 | if (!mdev->submit.wq) | ||
2599 | return -ENOMEM; | ||
2600 | |||
2601 | INIT_WORK(&mdev->submit.worker, do_submit); | ||
2602 | spin_lock_init(&mdev->submit.lock); | ||
2603 | INIT_LIST_HEAD(&mdev->submit.writes); | ||
2604 | return 0; | ||
2605 | } | ||
2606 | |||
2591 | enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) | 2607 | enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) |
2592 | { | 2608 | { |
2593 | struct drbd_conf *mdev; | 2609 | struct drbd_conf *mdev; |
@@ -2677,6 +2693,12 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, | |||
2677 | goto out_idr_remove_minor; | 2693 | goto out_idr_remove_minor; |
2678 | } | 2694 | } |
2679 | 2695 | ||
2696 | if (init_submitter(mdev)) { | ||
2697 | err = ERR_NOMEM; | ||
2698 | drbd_msg_put_info("unable to create submit workqueue"); | ||
2699 | goto out_idr_remove_vol; | ||
2700 | } | ||
2701 | |||
2680 | add_disk(disk); | 2702 | add_disk(disk); |
2681 | kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ | 2703 | kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ |
2682 | 2704 | ||
@@ -2687,6 +2709,8 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, | |||
2687 | 2709 | ||
2688 | return NO_ERROR; | 2710 | return NO_ERROR; |
2689 | 2711 | ||
2712 | out_idr_remove_vol: | ||
2713 | idr_remove(&tconn->volumes, vnr_got); | ||
2690 | out_idr_remove_minor: | 2714 | out_idr_remove_minor: |
2691 | idr_remove(&minors, minor_got); | 2715 | idr_remove(&minors, minor_got); |
2692 | synchronize_rcu(); | 2716 | synchronize_rcu(); |
@@ -2794,6 +2818,7 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) | |||
2794 | blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 2818 | blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
2795 | blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 2819 | blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
2796 | 2820 | ||
2821 | kfree(ldev->disk_conf); | ||
2797 | kfree(ldev); | 2822 | kfree(ldev); |
2798 | } | 2823 | } |
2799 | 2824 | ||
@@ -2833,8 +2858,9 @@ void conn_md_sync(struct drbd_tconn *tconn) | |||
2833 | rcu_read_unlock(); | 2858 | rcu_read_unlock(); |
2834 | } | 2859 | } |
2835 | 2860 | ||
2861 | /* aligned 4kByte */ | ||
2836 | struct meta_data_on_disk { | 2862 | struct meta_data_on_disk { |
2837 | u64 la_size; /* last agreed size. */ | 2863 | u64 la_size_sect; /* last agreed size. */ |
2838 | u64 uuid[UI_SIZE]; /* UUIDs. */ | 2864 | u64 uuid[UI_SIZE]; /* UUIDs. */ |
2839 | u64 device_uuid; | 2865 | u64 device_uuid; |
2840 | u64 reserved_u64_1; | 2866 | u64 reserved_u64_1; |
@@ -2842,13 +2868,17 @@ struct meta_data_on_disk { | |||
2842 | u32 magic; | 2868 | u32 magic; |
2843 | u32 md_size_sect; | 2869 | u32 md_size_sect; |
2844 | u32 al_offset; /* offset to this block */ | 2870 | u32 al_offset; /* offset to this block */ |
2845 | u32 al_nr_extents; /* important for restoring the AL */ | 2871 | u32 al_nr_extents; /* important for restoring the AL (userspace) */ |
2846 | /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ | 2872 | /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ |
2847 | u32 bm_offset; /* offset to the bitmap, from here */ | 2873 | u32 bm_offset; /* offset to the bitmap, from here */ |
2848 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ | 2874 | u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ |
2849 | u32 la_peer_max_bio_size; /* last peer max_bio_size */ | 2875 | u32 la_peer_max_bio_size; /* last peer max_bio_size */ |
2850 | u32 reserved_u32[3]; | ||
2851 | 2876 | ||
2877 | /* see al_tr_number_to_on_disk_sector() */ | ||
2878 | u32 al_stripes; | ||
2879 | u32 al_stripe_size_4k; | ||
2880 | |||
2881 | u8 reserved_u8[4096 - (7*8 + 10*4)]; | ||
2852 | } __packed; | 2882 | } __packed; |
2853 | 2883 | ||
2854 | /** | 2884 | /** |
@@ -2861,6 +2891,10 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
2861 | sector_t sector; | 2891 | sector_t sector; |
2862 | int i; | 2892 | int i; |
2863 | 2893 | ||
2894 | /* Don't accidentally change the DRBD meta data layout. */ | ||
2895 | BUILD_BUG_ON(UI_SIZE != 4); | ||
2896 | BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); | ||
2897 | |||
2864 | del_timer(&mdev->md_sync_timer); | 2898 | del_timer(&mdev->md_sync_timer); |
2865 | /* timer may be rearmed by drbd_md_mark_dirty() now. */ | 2899 | /* timer may be rearmed by drbd_md_mark_dirty() now. */ |
2866 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) | 2900 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) |
@@ -2875,9 +2909,9 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
2875 | if (!buffer) | 2909 | if (!buffer) |
2876 | goto out; | 2910 | goto out; |
2877 | 2911 | ||
2878 | memset(buffer, 0, 512); | 2912 | memset(buffer, 0, sizeof(*buffer)); |
2879 | 2913 | ||
2880 | buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); | 2914 | buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); |
2881 | for (i = UI_CURRENT; i < UI_SIZE; i++) | 2915 | for (i = UI_CURRENT; i < UI_SIZE; i++) |
2882 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); | 2916 | buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); |
2883 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); | 2917 | buffer->flags = cpu_to_be32(mdev->ldev->md.flags); |
@@ -2892,7 +2926,10 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
2892 | buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); | 2926 | buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); |
2893 | buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); | 2927 | buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); |
2894 | 2928 | ||
2895 | D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); | 2929 | buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes); |
2930 | buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k); | ||
2931 | |||
2932 | D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset); | ||
2896 | sector = mdev->ldev->md.md_offset; | 2933 | sector = mdev->ldev->md.md_offset; |
2897 | 2934 | ||
2898 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | 2935 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { |
@@ -2910,13 +2947,141 @@ out: | |||
2910 | put_ldev(mdev); | 2947 | put_ldev(mdev); |
2911 | } | 2948 | } |
2912 | 2949 | ||
2950 | static int check_activity_log_stripe_size(struct drbd_conf *mdev, | ||
2951 | struct meta_data_on_disk *on_disk, | ||
2952 | struct drbd_md *in_core) | ||
2953 | { | ||
2954 | u32 al_stripes = be32_to_cpu(on_disk->al_stripes); | ||
2955 | u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k); | ||
2956 | u64 al_size_4k; | ||
2957 | |||
2958 | /* both not set: default to old fixed size activity log */ | ||
2959 | if (al_stripes == 0 && al_stripe_size_4k == 0) { | ||
2960 | al_stripes = 1; | ||
2961 | al_stripe_size_4k = MD_32kB_SECT/8; | ||
2962 | } | ||
2963 | |||
2964 | /* some paranoia plausibility checks */ | ||
2965 | |||
2966 | /* we need both values to be set */ | ||
2967 | if (al_stripes == 0 || al_stripe_size_4k == 0) | ||
2968 | goto err; | ||
2969 | |||
2970 | al_size_4k = (u64)al_stripes * al_stripe_size_4k; | ||
2971 | |||
2972 | /* Upper limit of activity log area, to avoid potential overflow | ||
2973 | * problems in al_tr_number_to_on_disk_sector(). As right now, more | ||
2974 | * than 72 * 4k blocks total only increases the amount of history, | ||
2975 | * limiting this arbitrarily to 16 GB is not a real limitation ;-) */ | ||
2976 | if (al_size_4k > (16 * 1024 * 1024/4)) | ||
2977 | goto err; | ||
2978 | |||
2979 | /* Lower limit: we need at least 8 transaction slots (32kB) | ||
2980 | * to not break existing setups */ | ||
2981 | if (al_size_4k < MD_32kB_SECT/8) | ||
2982 | goto err; | ||
2983 | |||
2984 | in_core->al_stripe_size_4k = al_stripe_size_4k; | ||
2985 | in_core->al_stripes = al_stripes; | ||
2986 | in_core->al_size_4k = al_size_4k; | ||
2987 | |||
2988 | return 0; | ||
2989 | err: | ||
2990 | dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n", | ||
2991 | al_stripes, al_stripe_size_4k); | ||
2992 | return -EINVAL; | ||
2993 | } | ||
2994 | |||
2995 | static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | ||
2996 | { | ||
2997 | sector_t capacity = drbd_get_capacity(bdev->md_bdev); | ||
2998 | struct drbd_md *in_core = &bdev->md; | ||
2999 | s32 on_disk_al_sect; | ||
3000 | s32 on_disk_bm_sect; | ||
3001 | |||
3002 | /* The on-disk size of the activity log, calculated from offsets, and | ||
3003 | * the size of the activity log calculated from the stripe settings, | ||
3004 | * should match. | ||
3005 | * Though we could relax this a bit: it is ok, if the striped activity log | ||
3006 | * fits in the available on-disk activity log size. | ||
3007 | * Right now, that would break how resize is implemented. | ||
3008 | * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware | ||
3009 | * of possible unused padding space in the on disk layout. */ | ||
3010 | if (in_core->al_offset < 0) { | ||
3011 | if (in_core->bm_offset > in_core->al_offset) | ||
3012 | goto err; | ||
3013 | on_disk_al_sect = -in_core->al_offset; | ||
3014 | on_disk_bm_sect = in_core->al_offset - in_core->bm_offset; | ||
3015 | } else { | ||
3016 | if (in_core->al_offset != MD_4kB_SECT) | ||
3017 | goto err; | ||
3018 | if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT) | ||
3019 | goto err; | ||
3020 | |||
3021 | on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT; | ||
3022 | on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset; | ||
3023 | } | ||
3024 | |||
3025 | /* old fixed size meta data is exactly that: fixed. */ | ||
3026 | if (in_core->meta_dev_idx >= 0) { | ||
3027 | if (in_core->md_size_sect != MD_128MB_SECT | ||
3028 | || in_core->al_offset != MD_4kB_SECT | ||
3029 | || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT | ||
3030 | || in_core->al_stripes != 1 | ||
3031 | || in_core->al_stripe_size_4k != MD_32kB_SECT/8) | ||
3032 | goto err; | ||
3033 | } | ||
3034 | |||
3035 | if (capacity < in_core->md_size_sect) | ||
3036 | goto err; | ||
3037 | if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev)) | ||
3038 | goto err; | ||
3039 | |||
3040 | /* should be aligned, and at least 32k */ | ||
3041 | if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT)) | ||
3042 | goto err; | ||
3043 | |||
3044 | /* should fit (for now: exactly) into the available on-disk space; | ||
3045 | * overflow prevention is in check_activity_log_stripe_size() above. */ | ||
3046 | if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT) | ||
3047 | goto err; | ||
3048 | |||
3049 | /* again, should be aligned */ | ||
3050 | if (in_core->bm_offset & 7) | ||
3051 | goto err; | ||
3052 | |||
3053 | /* FIXME check for device grow with flex external meta data? */ | ||
3054 | |||
3055 | /* can the available bitmap space cover the last agreed device size? */ | ||
3056 | if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512) | ||
3057 | goto err; | ||
3058 | |||
3059 | return 0; | ||
3060 | |||
3061 | err: | ||
3062 | dev_err(DEV, "meta data offsets don't make sense: idx=%d " | ||
3063 | "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, " | ||
3064 | "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n", | ||
3065 | in_core->meta_dev_idx, | ||
3066 | in_core->al_stripes, in_core->al_stripe_size_4k, | ||
3067 | in_core->al_offset, in_core->bm_offset, in_core->md_size_sect, | ||
3068 | (unsigned long long)in_core->la_size_sect, | ||
3069 | (unsigned long long)capacity); | ||
3070 | |||
3071 | return -EINVAL; | ||
3072 | } | ||
3073 | |||
3074 | |||
2913 | /** | 3075 | /** |
2914 | * drbd_md_read() - Reads in the meta data super block | 3076 | * drbd_md_read() - Reads in the meta data super block |
2915 | * @mdev: DRBD device. | 3077 | * @mdev: DRBD device. |
2916 | * @bdev: Device from which the meta data should be read in. | 3078 | * @bdev: Device from which the meta data should be read in. |
2917 | * | 3079 | * |
2918 | * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case | 3080 | * Return NO_ERROR on success, and an enum drbd_ret_code in case |
2919 | * something goes wrong. | 3081 | * something goes wrong. |
3082 | * | ||
3083 | * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS, | ||
3084 | * even before @bdev is assigned to @mdev->ldev. | ||
2920 | */ | 3085 | */ |
2921 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | 3086 | int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) |
2922 | { | 3087 | { |
@@ -2924,12 +3089,17 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
2924 | u32 magic, flags; | 3089 | u32 magic, flags; |
2925 | int i, rv = NO_ERROR; | 3090 | int i, rv = NO_ERROR; |
2926 | 3091 | ||
2927 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | 3092 | if (mdev->state.disk != D_DISKLESS) |
2928 | return ERR_IO_MD_DISK; | 3093 | return ERR_DISK_CONFIGURED; |
2929 | 3094 | ||
2930 | buffer = drbd_md_get_buffer(mdev); | 3095 | buffer = drbd_md_get_buffer(mdev); |
2931 | if (!buffer) | 3096 | if (!buffer) |
2932 | goto out; | 3097 | return ERR_NOMEM; |
3098 | |||
3099 | /* First, figure out where our meta data superblock is located, | ||
3100 | * and read it. */ | ||
3101 | bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; | ||
3102 | bdev->md.md_offset = drbd_md_ss(bdev); | ||
2933 | 3103 | ||
2934 | if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { | 3104 | if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { |
2935 | /* NOTE: can't do normal error processing here as this is | 3105 | /* NOTE: can't do normal error processing here as this is |
@@ -2948,45 +3118,51 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
2948 | rv = ERR_MD_UNCLEAN; | 3118 | rv = ERR_MD_UNCLEAN; |
2949 | goto err; | 3119 | goto err; |
2950 | } | 3120 | } |
3121 | |||
3122 | rv = ERR_MD_INVALID; | ||
2951 | if (magic != DRBD_MD_MAGIC_08) { | 3123 | if (magic != DRBD_MD_MAGIC_08) { |
2952 | if (magic == DRBD_MD_MAGIC_07) | 3124 | if (magic == DRBD_MD_MAGIC_07) |
2953 | dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); | 3125 | dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); |
2954 | else | 3126 | else |
2955 | dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); | 3127 | dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); |
2956 | rv = ERR_MD_INVALID; | ||
2957 | goto err; | 3128 | goto err; |
2958 | } | 3129 | } |
2959 | if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { | 3130 | |
2960 | dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", | 3131 | if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { |
2961 | be32_to_cpu(buffer->al_offset), bdev->md.al_offset); | 3132 | dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", |
2962 | rv = ERR_MD_INVALID; | 3133 | be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); |
2963 | goto err; | 3134 | goto err; |
2964 | } | 3135 | } |
3136 | |||
3137 | |||
3138 | /* convert to in_core endian */ | ||
3139 | bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); | ||
3140 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
3141 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); | ||
3142 | bdev->md.flags = be32_to_cpu(buffer->flags); | ||
3143 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); | ||
3144 | |||
3145 | bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect); | ||
3146 | bdev->md.al_offset = be32_to_cpu(buffer->al_offset); | ||
3147 | bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset); | ||
3148 | |||
3149 | if (check_activity_log_stripe_size(mdev, buffer, &bdev->md)) | ||
3150 | goto err; | ||
3151 | if (check_offsets_and_sizes(mdev, bdev)) | ||
3152 | goto err; | ||
3153 | |||
2965 | if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { | 3154 | if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { |
2966 | dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", | 3155 | dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", |
2967 | be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); | 3156 | be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); |
2968 | rv = ERR_MD_INVALID; | ||
2969 | goto err; | 3157 | goto err; |
2970 | } | 3158 | } |
2971 | if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { | 3159 | if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { |
2972 | dev_err(DEV, "unexpected md_size: %u (expected %u)\n", | 3160 | dev_err(DEV, "unexpected md_size: %u (expected %u)\n", |
2973 | be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); | 3161 | be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); |
2974 | rv = ERR_MD_INVALID; | ||
2975 | goto err; | 3162 | goto err; |
2976 | } | 3163 | } |
2977 | 3164 | ||
2978 | if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { | 3165 | rv = NO_ERROR; |
2979 | dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", | ||
2980 | be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); | ||
2981 | rv = ERR_MD_INVALID; | ||
2982 | goto err; | ||
2983 | } | ||
2984 | |||
2985 | bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); | ||
2986 | for (i = UI_CURRENT; i < UI_SIZE; i++) | ||
2987 | bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); | ||
2988 | bdev->md.flags = be32_to_cpu(buffer->flags); | ||
2989 | bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); | ||
2990 | 3166 | ||
2991 | spin_lock_irq(&mdev->tconn->req_lock); | 3167 | spin_lock_irq(&mdev->tconn->req_lock); |
2992 | if (mdev->state.conn < C_CONNECTED) { | 3168 | if (mdev->state.conn < C_CONNECTED) { |
@@ -2999,8 +3175,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) | |||
2999 | 3175 | ||
3000 | err: | 3176 | err: |
3001 | drbd_md_put_buffer(mdev); | 3177 | drbd_md_put_buffer(mdev); |
3002 | out: | ||
3003 | put_ldev(mdev); | ||
3004 | 3178 | ||
3005 | return rv; | 3179 | return rv; |
3006 | } | 3180 | } |
@@ -3238,8 +3412,12 @@ static int w_go_diskless(struct drbd_work *w, int unused) | |||
3238 | * end up here after a failed attach, before ldev was even assigned. | 3412 | * end up here after a failed attach, before ldev was even assigned. |
3239 | */ | 3413 | */ |
3240 | if (mdev->bitmap && mdev->ldev) { | 3414 | if (mdev->bitmap && mdev->ldev) { |
3415 | /* An interrupted resync or similar is allowed to recounts bits | ||
3416 | * while we detach. | ||
3417 | * Any modifications would not be expected anymore, though. | ||
3418 | */ | ||
3241 | if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, | 3419 | if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, |
3242 | "detach", BM_LOCKED_MASK)) { | 3420 | "detach", BM_LOCKED_TEST_ALLOWED)) { |
3243 | if (test_bit(WAS_READ_ERROR, &mdev->flags)) { | 3421 | if (test_bit(WAS_READ_ERROR, &mdev->flags)) { |
3244 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); | 3422 | drbd_md_set_flag(mdev, MDF_FULL_SYNC); |
3245 | drbd_md_sync(mdev); | 3423 | drbd_md_sync(mdev); |
@@ -3251,13 +3429,6 @@ static int w_go_diskless(struct drbd_work *w, int unused) | |||
3251 | return 0; | 3429 | return 0; |
3252 | } | 3430 | } |
3253 | 3431 | ||
3254 | void drbd_go_diskless(struct drbd_conf *mdev) | ||
3255 | { | ||
3256 | D_ASSERT(mdev->state.disk == D_FAILED); | ||
3257 | if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) | ||
3258 | drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); | ||
3259 | } | ||
3260 | |||
3261 | /** | 3432 | /** |
3262 | * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap | 3433 | * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap |
3263 | * @mdev: DRBD device. | 3434 | * @mdev: DRBD device. |
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 2af26fc95280..9e3f441e7e84 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -696,37 +696,52 @@ out: | |||
696 | return 0; | 696 | return 0; |
697 | } | 697 | } |
698 | 698 | ||
699 | /* initializes the md.*_offset members, so we are able to find | 699 | /* Initializes the md.*_offset members, so we are able to find |
700 | * the on disk meta data */ | 700 | * the on disk meta data. |
701 | * | ||
702 | * We currently have two possible layouts: | ||
703 | * external: | ||
704 | * |----------- md_size_sect ------------------| | ||
705 | * [ 4k superblock ][ activity log ][ Bitmap ] | ||
706 | * | al_offset == 8 | | ||
707 | * | bm_offset = al_offset + X | | ||
708 | * ==> bitmap sectors = md_size_sect - bm_offset | ||
709 | * | ||
710 | * internal: | ||
711 | * |----------- md_size_sect ------------------| | ||
712 | * [data.....][ Bitmap ][ activity log ][ 4k superblock ] | ||
713 | * | al_offset < 0 | | ||
714 | * | bm_offset = al_offset - Y | | ||
715 | * ==> bitmap sectors = Y = al_offset - bm_offset | ||
716 | * | ||
717 | * Activity log size used to be fixed 32kB, | ||
718 | * but is about to become configurable. | ||
719 | */ | ||
701 | static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | 720 | static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, |
702 | struct drbd_backing_dev *bdev) | 721 | struct drbd_backing_dev *bdev) |
703 | { | 722 | { |
704 | sector_t md_size_sect = 0; | 723 | sector_t md_size_sect = 0; |
705 | int meta_dev_idx; | 724 | unsigned int al_size_sect = bdev->md.al_size_4k * 8; |
706 | 725 | ||
707 | rcu_read_lock(); | 726 | bdev->md.md_offset = drbd_md_ss(bdev); |
708 | meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; | ||
709 | 727 | ||
710 | switch (meta_dev_idx) { | 728 | switch (bdev->md.meta_dev_idx) { |
711 | default: | 729 | default: |
712 | /* v07 style fixed size indexed meta data */ | 730 | /* v07 style fixed size indexed meta data */ |
713 | bdev->md.md_size_sect = MD_RESERVED_SECT; | 731 | bdev->md.md_size_sect = MD_128MB_SECT; |
714 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | 732 | bdev->md.al_offset = MD_4kB_SECT; |
715 | bdev->md.al_offset = MD_AL_OFFSET; | 733 | bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; |
716 | bdev->md.bm_offset = MD_BM_OFFSET; | ||
717 | break; | 734 | break; |
718 | case DRBD_MD_INDEX_FLEX_EXT: | 735 | case DRBD_MD_INDEX_FLEX_EXT: |
719 | /* just occupy the full device; unit: sectors */ | 736 | /* just occupy the full device; unit: sectors */ |
720 | bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); | 737 | bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); |
721 | bdev->md.md_offset = 0; | 738 | bdev->md.al_offset = MD_4kB_SECT; |
722 | bdev->md.al_offset = MD_AL_OFFSET; | 739 | bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; |
723 | bdev->md.bm_offset = MD_BM_OFFSET; | ||
724 | break; | 740 | break; |
725 | case DRBD_MD_INDEX_INTERNAL: | 741 | case DRBD_MD_INDEX_INTERNAL: |
726 | case DRBD_MD_INDEX_FLEX_INT: | 742 | case DRBD_MD_INDEX_FLEX_INT: |
727 | bdev->md.md_offset = drbd_md_ss__(mdev, bdev); | ||
728 | /* al size is still fixed */ | 743 | /* al size is still fixed */ |
729 | bdev->md.al_offset = -MD_AL_SECTORS; | 744 | bdev->md.al_offset = -al_size_sect; |
730 | /* we need (slightly less than) ~ this much bitmap sectors: */ | 745 | /* we need (slightly less than) ~ this much bitmap sectors: */ |
731 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); | 746 | md_size_sect = drbd_get_capacity(bdev->backing_bdev); |
732 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); | 747 | md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); |
@@ -735,14 +750,13 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, | |||
735 | 750 | ||
736 | /* plus the "drbd meta data super block", | 751 | /* plus the "drbd meta data super block", |
737 | * and the activity log; */ | 752 | * and the activity log; */ |
738 | md_size_sect += MD_BM_OFFSET; | 753 | md_size_sect += MD_4kB_SECT + al_size_sect; |
739 | 754 | ||
740 | bdev->md.md_size_sect = md_size_sect; | 755 | bdev->md.md_size_sect = md_size_sect; |
741 | /* bitmap offset is adjusted by 'super' block size */ | 756 | /* bitmap offset is adjusted by 'super' block size */ |
742 | bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; | 757 | bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; |
743 | break; | 758 | break; |
744 | } | 759 | } |
745 | rcu_read_unlock(); | ||
746 | } | 760 | } |
747 | 761 | ||
748 | /* input size is expected to be in KB */ | 762 | /* input size is expected to be in KB */ |
@@ -805,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev) | |||
805 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) | 819 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) |
806 | { | 820 | { |
807 | sector_t prev_first_sect, prev_size; /* previous meta location */ | 821 | sector_t prev_first_sect, prev_size; /* previous meta location */ |
808 | sector_t la_size, u_size; | 822 | sector_t la_size_sect, u_size; |
809 | sector_t size; | 823 | sector_t size; |
810 | char ppb[10]; | 824 | char ppb[10]; |
811 | 825 | ||
@@ -828,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
828 | 842 | ||
829 | prev_first_sect = drbd_md_first_sector(mdev->ldev); | 843 | prev_first_sect = drbd_md_first_sector(mdev->ldev); |
830 | prev_size = mdev->ldev->md.md_size_sect; | 844 | prev_size = mdev->ldev->md.md_size_sect; |
831 | la_size = mdev->ldev->md.la_size_sect; | 845 | la_size_sect = mdev->ldev->md.la_size_sect; |
832 | 846 | ||
833 | /* TODO: should only be some assert here, not (re)init... */ | 847 | /* TODO: should only be some assert here, not (re)init... */ |
834 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | 848 | drbd_md_set_sector_offsets(mdev, mdev->ldev); |
@@ -864,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
864 | if (rv == dev_size_error) | 878 | if (rv == dev_size_error) |
865 | goto out; | 879 | goto out; |
866 | 880 | ||
867 | la_size_changed = (la_size != mdev->ldev->md.la_size_sect); | 881 | la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); |
868 | 882 | ||
869 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) | 883 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) |
870 | || prev_size != mdev->ldev->md.md_size_sect; | 884 | || prev_size != mdev->ldev->md.md_size_sect; |
@@ -886,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
886 | drbd_md_mark_dirty(mdev); | 900 | drbd_md_mark_dirty(mdev); |
887 | } | 901 | } |
888 | 902 | ||
889 | if (size > la_size) | 903 | if (size > la_size_sect) |
890 | rv = grew; | 904 | rv = grew; |
891 | if (size < la_size) | 905 | if (size < la_size_sect) |
892 | rv = shrunk; | 906 | rv = shrunk; |
893 | out: | 907 | out: |
894 | lc_unlock(mdev->act_log); | 908 | lc_unlock(mdev->act_log); |
@@ -903,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
903 | sector_t u_size, int assume_peer_has_space) | 917 | sector_t u_size, int assume_peer_has_space) |
904 | { | 918 | { |
905 | sector_t p_size = mdev->p_size; /* partner's disk size. */ | 919 | sector_t p_size = mdev->p_size; /* partner's disk size. */ |
906 | sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ | 920 | sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */ |
907 | sector_t m_size; /* my size */ | 921 | sector_t m_size; /* my size */ |
908 | sector_t size = 0; | 922 | sector_t size = 0; |
909 | 923 | ||
@@ -917,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
917 | if (p_size && m_size) { | 931 | if (p_size && m_size) { |
918 | size = min_t(sector_t, p_size, m_size); | 932 | size = min_t(sector_t, p_size, m_size); |
919 | } else { | 933 | } else { |
920 | if (la_size) { | 934 | if (la_size_sect) { |
921 | size = la_size; | 935 | size = la_size_sect; |
922 | if (m_size && m_size < size) | 936 | if (m_size && m_size < size) |
923 | size = m_size; | 937 | size = m_size; |
924 | if (p_size && p_size < size) | 938 | if (p_size && p_size < size) |
@@ -1127,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info) | |||
1127 | return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); | 1141 | return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); |
1128 | } | 1142 | } |
1129 | 1143 | ||
1130 | static void enforce_disk_conf_limits(struct disk_conf *dc) | 1144 | static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) |
1131 | { | 1145 | { |
1132 | if (dc->al_extents < DRBD_AL_EXTENTS_MIN) | 1146 | /* This is limited by 16 bit "slot" numbers, |
1133 | dc->al_extents = DRBD_AL_EXTENTS_MIN; | 1147 | * and by available on-disk context storage. |
1134 | if (dc->al_extents > DRBD_AL_EXTENTS_MAX) | 1148 | * |
1135 | dc->al_extents = DRBD_AL_EXTENTS_MAX; | 1149 | * Also (u16)~0 is special (denotes a "free" extent). |
1150 | * | ||
1151 | * One transaction occupies one 4kB on-disk block, | ||
1152 | * we have n such blocks in the on disk ring buffer, | ||
1153 | * the "current" transaction may fail (n-1), | ||
1154 | * and there is 919 slot numbers context information per transaction. | ||
1155 | * | ||
1156 | * 72 transaction blocks amounts to more than 2**16 context slots, | ||
1157 | * so cap there first. | ||
1158 | */ | ||
1159 | const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX; | ||
1160 | const unsigned int sufficient_on_disk = | ||
1161 | (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1) | ||
1162 | /AL_CONTEXT_PER_TRANSACTION; | ||
1163 | |||
1164 | unsigned int al_size_4k = bdev->md.al_size_4k; | ||
1165 | |||
1166 | if (al_size_4k > sufficient_on_disk) | ||
1167 | return max_al_nr; | ||
1136 | 1168 | ||
1137 | if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) | 1169 | return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; |
1138 | dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; | ||
1139 | } | 1170 | } |
1140 | 1171 | ||
1141 | int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) | 1172 | int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) |
@@ -1182,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) | |||
1182 | if (!expect(new_disk_conf->resync_rate >= 1)) | 1213 | if (!expect(new_disk_conf->resync_rate >= 1)) |
1183 | new_disk_conf->resync_rate = 1; | 1214 | new_disk_conf->resync_rate = 1; |
1184 | 1215 | ||
1185 | enforce_disk_conf_limits(new_disk_conf); | 1216 | if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) |
1217 | new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; | ||
1218 | if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev)) | ||
1219 | new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev); | ||
1220 | |||
1221 | if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) | ||
1222 | new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; | ||
1186 | 1223 | ||
1187 | fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; | 1224 | fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; |
1188 | if (fifo_size != mdev->rs_plan_s->size) { | 1225 | if (fifo_size != mdev->rs_plan_s->size) { |
@@ -1330,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1330 | goto fail; | 1367 | goto fail; |
1331 | } | 1368 | } |
1332 | 1369 | ||
1333 | enforce_disk_conf_limits(new_disk_conf); | 1370 | if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) |
1371 | new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; | ||
1334 | 1372 | ||
1335 | new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); | 1373 | new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); |
1336 | if (!new_plan) { | 1374 | if (!new_plan) { |
@@ -1343,6 +1381,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1343 | goto fail; | 1381 | goto fail; |
1344 | } | 1382 | } |
1345 | 1383 | ||
1384 | write_lock_irq(&global_state_lock); | ||
1385 | retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after); | ||
1386 | write_unlock_irq(&global_state_lock); | ||
1387 | if (retcode != NO_ERROR) | ||
1388 | goto fail; | ||
1389 | |||
1346 | rcu_read_lock(); | 1390 | rcu_read_lock(); |
1347 | nc = rcu_dereference(mdev->tconn->net_conf); | 1391 | nc = rcu_dereference(mdev->tconn->net_conf); |
1348 | if (nc) { | 1392 | if (nc) { |
@@ -1399,8 +1443,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1399 | goto fail; | 1443 | goto fail; |
1400 | } | 1444 | } |
1401 | 1445 | ||
1402 | /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ | 1446 | /* Read our meta data super block early. |
1403 | drbd_md_set_sector_offsets(mdev, nbc); | 1447 | * This also sets other on-disk offsets. */ |
1448 | retcode = drbd_md_read(mdev, nbc); | ||
1449 | if (retcode != NO_ERROR) | ||
1450 | goto fail; | ||
1451 | |||
1452 | if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) | ||
1453 | new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; | ||
1454 | if (new_disk_conf->al_extents > drbd_al_extents_max(nbc)) | ||
1455 | new_disk_conf->al_extents = drbd_al_extents_max(nbc); | ||
1404 | 1456 | ||
1405 | if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { | 1457 | if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { |
1406 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", | 1458 | dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", |
@@ -1416,7 +1468,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1416 | min_md_device_sectors = (2<<10); | 1468 | min_md_device_sectors = (2<<10); |
1417 | } else { | 1469 | } else { |
1418 | max_possible_sectors = DRBD_MAX_SECTORS; | 1470 | max_possible_sectors = DRBD_MAX_SECTORS; |
1419 | min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); | 1471 | min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1); |
1420 | } | 1472 | } |
1421 | 1473 | ||
1422 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { | 1474 | if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { |
@@ -1467,8 +1519,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1467 | if (!get_ldev_if_state(mdev, D_ATTACHING)) | 1519 | if (!get_ldev_if_state(mdev, D_ATTACHING)) |
1468 | goto force_diskless; | 1520 | goto force_diskless; |
1469 | 1521 | ||
1470 | drbd_md_set_sector_offsets(mdev, nbc); | ||
1471 | |||
1472 | if (!mdev->bitmap) { | 1522 | if (!mdev->bitmap) { |
1473 | if (drbd_bm_init(mdev)) { | 1523 | if (drbd_bm_init(mdev)) { |
1474 | retcode = ERR_NOMEM; | 1524 | retcode = ERR_NOMEM; |
@@ -1476,10 +1526,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1476 | } | 1526 | } |
1477 | } | 1527 | } |
1478 | 1528 | ||
1479 | retcode = drbd_md_read(mdev, nbc); | ||
1480 | if (retcode != NO_ERROR) | ||
1481 | goto force_diskless_dec; | ||
1482 | |||
1483 | if (mdev->state.conn < C_CONNECTED && | 1529 | if (mdev->state.conn < C_CONNECTED && |
1484 | mdev->state.role == R_PRIMARY && | 1530 | mdev->state.role == R_PRIMARY && |
1485 | (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { | 1531 | (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { |
@@ -2158,8 +2204,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for | |||
2158 | return SS_SUCCESS; | 2204 | return SS_SUCCESS; |
2159 | case SS_PRIMARY_NOP: | 2205 | case SS_PRIMARY_NOP: |
2160 | /* Our state checking code wants to see the peer outdated. */ | 2206 | /* Our state checking code wants to see the peer outdated. */ |
2161 | rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, | 2207 | rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0); |
2162 | pdsk, D_OUTDATED), CS_VERBOSE); | 2208 | |
2209 | if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */ | ||
2210 | rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE); | ||
2211 | |||
2163 | break; | 2212 | break; |
2164 | case SS_CW_FAILED_BY_PEER: | 2213 | case SS_CW_FAILED_BY_PEER: |
2165 | /* The peer probably wants to see us outdated. */ | 2214 | /* The peer probably wants to see us outdated. */ |
@@ -2406,22 +2455,19 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) | |||
2406 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); | 2455 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
2407 | drbd_flush_workqueue(mdev); | 2456 | drbd_flush_workqueue(mdev); |
2408 | 2457 | ||
2409 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); | 2458 | /* If we happen to be C_STANDALONE R_SECONDARY, just change to |
2410 | 2459 | * D_INCONSISTENT, and set all bits in the bitmap. Otherwise, | |
2411 | if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) | 2460 | * try to start a resync handshake as sync target for full sync. |
2412 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | 2461 | */ |
2413 | 2462 | if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) { | |
2414 | while (retcode == SS_NEED_CONNECTION) { | 2463 | retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT)); |
2415 | spin_lock_irq(&mdev->tconn->req_lock); | 2464 | if (retcode >= SS_SUCCESS) { |
2416 | if (mdev->state.conn < C_CONNECTED) | 2465 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, |
2417 | retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); | 2466 | "set_n_write from invalidate", BM_LOCKED_MASK)) |
2418 | spin_unlock_irq(&mdev->tconn->req_lock); | 2467 | retcode = ERR_IO_MD_DISK; |
2419 | 2468 | } | |
2420 | if (retcode != SS_NEED_CONNECTION) | 2469 | } else |
2421 | break; | ||
2422 | |||
2423 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); | 2470 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); |
2424 | } | ||
2425 | drbd_resume_io(mdev); | 2471 | drbd_resume_io(mdev); |
2426 | 2472 | ||
2427 | out: | 2473 | out: |
@@ -2475,21 +2521,22 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) | |||
2475 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); | 2521 | wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); |
2476 | drbd_flush_workqueue(mdev); | 2522 | drbd_flush_workqueue(mdev); |
2477 | 2523 | ||
2478 | retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); | 2524 | /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits |
2479 | if (retcode < SS_SUCCESS) { | 2525 | * in the bitmap. Otherwise, try to start a resync handshake |
2480 | if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { | 2526 | * as sync source for full sync. |
2481 | /* The peer will get a resync upon connect anyways. | 2527 | */ |
2482 | * Just make that into a full resync. */ | 2528 | if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) { |
2483 | retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); | 2529 | /* The peer will get a resync upon connect anyways. Just make that |
2484 | if (retcode >= SS_SUCCESS) { | 2530 | into a full resync. */ |
2485 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, | 2531 | retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); |
2486 | "set_n_write from invalidate_peer", | 2532 | if (retcode >= SS_SUCCESS) { |
2487 | BM_LOCKED_SET_ALLOWED)) | 2533 | if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, |
2488 | retcode = ERR_IO_MD_DISK; | 2534 | "set_n_write from invalidate_peer", |
2489 | } | 2535 | BM_LOCKED_SET_ALLOWED)) |
2490 | } else | 2536 | retcode = ERR_IO_MD_DISK; |
2491 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); | 2537 | } |
2492 | } | 2538 | } else |
2539 | retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); | ||
2493 | drbd_resume_io(mdev); | 2540 | drbd_resume_io(mdev); |
2494 | 2541 | ||
2495 | out: | 2542 | out: |
@@ -3162,6 +3209,7 @@ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev) | |||
3162 | CS_VERBOSE + CS_WAIT_COMPLETE); | 3209 | CS_VERBOSE + CS_WAIT_COMPLETE); |
3163 | idr_remove(&mdev->tconn->volumes, mdev->vnr); | 3210 | idr_remove(&mdev->tconn->volumes, mdev->vnr); |
3164 | idr_remove(&minors, mdev_to_minor(mdev)); | 3211 | idr_remove(&minors, mdev_to_minor(mdev)); |
3212 | destroy_workqueue(mdev->submit.wq); | ||
3165 | del_gendisk(mdev->vdisk); | 3213 | del_gendisk(mdev->vdisk); |
3166 | synchronize_rcu(); | 3214 | synchronize_rcu(); |
3167 | kref_put(&mdev->kref, &drbd_minor_destroy); | 3215 | kref_put(&mdev->kref, &drbd_minor_destroy); |
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 928adb815b09..bf31d41dbaad 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c | |||
@@ -313,8 +313,14 @@ static int drbd_seq_show(struct seq_file *seq, void *v) | |||
313 | 313 | ||
314 | static int drbd_proc_open(struct inode *inode, struct file *file) | 314 | static int drbd_proc_open(struct inode *inode, struct file *file) |
315 | { | 315 | { |
316 | if (try_module_get(THIS_MODULE)) | 316 | int err; |
317 | return single_open(file, drbd_seq_show, PDE_DATA(inode)); | 317 | |
318 | if (try_module_get(THIS_MODULE)) { | ||
319 | err = single_open(file, drbd_seq_show, PDE_DATA(inode)); | ||
320 | if (err) | ||
321 | module_put(THIS_MODULE); | ||
322 | return err; | ||
323 | } | ||
318 | return -ENODEV; | 324 | return -ENODEV; |
319 | } | 325 | } |
320 | 326 | ||
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 83c5ae0ed56b..4222affff488 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -850,6 +850,7 @@ int drbd_connected(struct drbd_conf *mdev) | |||
850 | err = drbd_send_current_state(mdev); | 850 | err = drbd_send_current_state(mdev); |
851 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); | 851 | clear_bit(USE_DEGR_WFC_T, &mdev->flags); |
852 | clear_bit(RESIZE_PENDING, &mdev->flags); | 852 | clear_bit(RESIZE_PENDING, &mdev->flags); |
853 | atomic_set(&mdev->ap_in_flight, 0); | ||
853 | mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ | 854 | mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ |
854 | return err; | 855 | return err; |
855 | } | 856 | } |
@@ -2266,7 +2267,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) | |||
2266 | drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); | 2267 | drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); |
2267 | peer_req->flags |= EE_CALL_AL_COMPLETE_IO; | 2268 | peer_req->flags |= EE_CALL_AL_COMPLETE_IO; |
2268 | peer_req->flags &= ~EE_MAY_SET_IN_SYNC; | 2269 | peer_req->flags &= ~EE_MAY_SET_IN_SYNC; |
2269 | drbd_al_begin_io(mdev, &peer_req->i); | 2270 | drbd_al_begin_io(mdev, &peer_req->i, true); |
2270 | } | 2271 | } |
2271 | 2272 | ||
2272 | err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); | 2273 | err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); |
@@ -2662,7 +2663,6 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) | |||
2662 | if (hg == -1 && mdev->state.role == R_PRIMARY) { | 2663 | if (hg == -1 && mdev->state.role == R_PRIMARY) { |
2663 | enum drbd_state_rv rv2; | 2664 | enum drbd_state_rv rv2; |
2664 | 2665 | ||
2665 | drbd_set_role(mdev, R_SECONDARY, 0); | ||
2666 | /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, | 2666 | /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, |
2667 | * we might be here in C_WF_REPORT_PARAMS which is transient. | 2667 | * we might be here in C_WF_REPORT_PARAMS which is transient. |
2668 | * we do not need to wait for the after state change work either. */ | 2668 | * we do not need to wait for the after state change work either. */ |
@@ -3993,7 +3993,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) | |||
3993 | 3993 | ||
3994 | clear_bit(DISCARD_MY_DATA, &mdev->flags); | 3994 | clear_bit(DISCARD_MY_DATA, &mdev->flags); |
3995 | 3995 | ||
3996 | drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ | 3996 | drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */ |
3997 | 3997 | ||
3998 | return 0; | 3998 | return 0; |
3999 | } | 3999 | } |
@@ -4660,8 +4660,8 @@ static int drbd_do_features(struct drbd_tconn *tconn) | |||
4660 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) | 4660 | #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) |
4661 | static int drbd_do_auth(struct drbd_tconn *tconn) | 4661 | static int drbd_do_auth(struct drbd_tconn *tconn) |
4662 | { | 4662 | { |
4663 | dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); | 4663 | conn_err(tconn, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); |
4664 | dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); | 4664 | conn_err(tconn, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); |
4665 | return -1; | 4665 | return -1; |
4666 | } | 4666 | } |
4667 | #else | 4667 | #else |
@@ -5258,9 +5258,11 @@ int drbd_asender(struct drbd_thread *thi) | |||
5258 | bool ping_timeout_active = false; | 5258 | bool ping_timeout_active = false; |
5259 | struct net_conf *nc; | 5259 | struct net_conf *nc; |
5260 | int ping_timeo, tcp_cork, ping_int; | 5260 | int ping_timeo, tcp_cork, ping_int; |
5261 | struct sched_param param = { .sched_priority = 2 }; | ||
5261 | 5262 | ||
5262 | current->policy = SCHED_RR; /* Make this a realtime task! */ | 5263 | rv = sched_setscheduler(current, SCHED_RR, ¶m); |
5263 | current->rt_priority = 2; /* more important than all other tasks */ | 5264 | if (rv < 0) |
5265 | conn_err(tconn, "drbd_asender: ERROR set priority, ret=%d\n", rv); | ||
5264 | 5266 | ||
5265 | while (get_t_state(thi) == RUNNING) { | 5267 | while (get_t_state(thi) == RUNNING) { |
5266 | drbd_thread_current_set_cpu(thi); | 5268 | drbd_thread_current_set_cpu(thi); |
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 2b8303ad63c9..c24379ffd4e3 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c | |||
@@ -34,14 +34,14 @@ | |||
34 | static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); | 34 | static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); |
35 | 35 | ||
36 | /* Update disk stats at start of I/O request */ | 36 | /* Update disk stats at start of I/O request */ |
37 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) | 37 | static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req) |
38 | { | 38 | { |
39 | const int rw = bio_data_dir(bio); | 39 | const int rw = bio_data_dir(req->master_bio); |
40 | int cpu; | 40 | int cpu; |
41 | cpu = part_stat_lock(); | 41 | cpu = part_stat_lock(); |
42 | part_round_stats(cpu, &mdev->vdisk->part0); | 42 | part_round_stats(cpu, &mdev->vdisk->part0); |
43 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); | 43 | part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); |
44 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); | 44 | part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9); |
45 | (void) cpu; /* The macro invocations above want the cpu argument, I do not like | 45 | (void) cpu; /* The macro invocations above want the cpu argument, I do not like |
46 | the compiler warning about cpu only assigned but never used... */ | 46 | the compiler warning about cpu only assigned but never used... */ |
47 | part_inc_in_flight(&mdev->vdisk->part0, rw); | 47 | part_inc_in_flight(&mdev->vdisk->part0, rw); |
@@ -263,8 +263,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) | |||
263 | else | 263 | else |
264 | root = &mdev->read_requests; | 264 | root = &mdev->read_requests; |
265 | drbd_remove_request_interval(root, req); | 265 | drbd_remove_request_interval(root, req); |
266 | } else if (!(s & RQ_POSTPONED)) | 266 | } |
267 | D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); | ||
268 | 267 | ||
269 | /* Before we can signal completion to the upper layers, | 268 | /* Before we can signal completion to the upper layers, |
270 | * we may need to close the current transfer log epoch. | 269 | * we may need to close the current transfer log epoch. |
@@ -755,6 +754,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, | |||
755 | D_ASSERT(req->rq_state & RQ_NET_PENDING); | 754 | D_ASSERT(req->rq_state & RQ_NET_PENDING); |
756 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); | 755 | mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); |
757 | break; | 756 | break; |
757 | |||
758 | case QUEUE_AS_DRBD_BARRIER: | ||
759 | start_new_tl_epoch(mdev->tconn); | ||
760 | mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); | ||
761 | break; | ||
758 | }; | 762 | }; |
759 | 763 | ||
760 | return rv; | 764 | return rv; |
@@ -861,8 +865,10 @@ static void maybe_pull_ahead(struct drbd_conf *mdev) | |||
861 | bool congested = false; | 865 | bool congested = false; |
862 | enum drbd_on_congestion on_congestion; | 866 | enum drbd_on_congestion on_congestion; |
863 | 867 | ||
868 | rcu_read_lock(); | ||
864 | nc = rcu_dereference(tconn->net_conf); | 869 | nc = rcu_dereference(tconn->net_conf); |
865 | on_congestion = nc ? nc->on_congestion : OC_BLOCK; | 870 | on_congestion = nc ? nc->on_congestion : OC_BLOCK; |
871 | rcu_read_unlock(); | ||
866 | if (on_congestion == OC_BLOCK || | 872 | if (on_congestion == OC_BLOCK || |
867 | tconn->agreed_pro_version < 96) | 873 | tconn->agreed_pro_version < 96) |
868 | return; | 874 | return; |
@@ -956,14 +962,8 @@ static int drbd_process_write_request(struct drbd_request *req) | |||
956 | struct drbd_conf *mdev = req->w.mdev; | 962 | struct drbd_conf *mdev = req->w.mdev; |
957 | int remote, send_oos; | 963 | int remote, send_oos; |
958 | 964 | ||
959 | rcu_read_lock(); | ||
960 | remote = drbd_should_do_remote(mdev->state); | 965 | remote = drbd_should_do_remote(mdev->state); |
961 | if (remote) { | ||
962 | maybe_pull_ahead(mdev); | ||
963 | remote = drbd_should_do_remote(mdev->state); | ||
964 | } | ||
965 | send_oos = drbd_should_send_out_of_sync(mdev->state); | 966 | send_oos = drbd_should_send_out_of_sync(mdev->state); |
966 | rcu_read_unlock(); | ||
967 | 967 | ||
968 | /* Need to replicate writes. Unless it is an empty flush, | 968 | /* Need to replicate writes. Unless it is an empty flush, |
969 | * which is better mapped to a DRBD P_BARRIER packet, | 969 | * which is better mapped to a DRBD P_BARRIER packet, |
@@ -975,8 +975,8 @@ static int drbd_process_write_request(struct drbd_request *req) | |||
975 | /* The only size==0 bios we expect are empty flushes. */ | 975 | /* The only size==0 bios we expect are empty flushes. */ |
976 | D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); | 976 | D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); |
977 | if (remote) | 977 | if (remote) |
978 | start_new_tl_epoch(mdev->tconn); | 978 | _req_mod(req, QUEUE_AS_DRBD_BARRIER); |
979 | return 0; | 979 | return remote; |
980 | } | 980 | } |
981 | 981 | ||
982 | if (!remote && !send_oos) | 982 | if (!remote && !send_oos) |
@@ -1020,12 +1020,24 @@ drbd_submit_req_private_bio(struct drbd_request *req) | |||
1020 | bio_endio(bio, -EIO); | 1020 | bio_endio(bio, -EIO); |
1021 | } | 1021 | } |
1022 | 1022 | ||
1023 | void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | 1023 | static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req) |
1024 | { | 1024 | { |
1025 | const int rw = bio_rw(bio); | 1025 | spin_lock(&mdev->submit.lock); |
1026 | struct bio_and_error m = { NULL, }; | 1026 | list_add_tail(&req->tl_requests, &mdev->submit.writes); |
1027 | spin_unlock(&mdev->submit.lock); | ||
1028 | queue_work(mdev->submit.wq, &mdev->submit.worker); | ||
1029 | } | ||
1030 | |||
1031 | /* returns the new drbd_request pointer, if the caller is expected to | ||
1032 | * drbd_send_and_submit() it (to save latency), or NULL if we queued the | ||
1033 | * request on the submitter thread. | ||
1034 | * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. | ||
1035 | */ | ||
1036 | struct drbd_request * | ||
1037 | drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | ||
1038 | { | ||
1039 | const int rw = bio_data_dir(bio); | ||
1027 | struct drbd_request *req; | 1040 | struct drbd_request *req; |
1028 | bool no_remote = false; | ||
1029 | 1041 | ||
1030 | /* allocate outside of all locks; */ | 1042 | /* allocate outside of all locks; */ |
1031 | req = drbd_req_new(mdev, bio); | 1043 | req = drbd_req_new(mdev, bio); |
@@ -1035,7 +1047,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long | |||
1035 | * if user cannot handle io errors, that's not our business. */ | 1047 | * if user cannot handle io errors, that's not our business. */ |
1036 | dev_err(DEV, "could not kmalloc() req\n"); | 1048 | dev_err(DEV, "could not kmalloc() req\n"); |
1037 | bio_endio(bio, -ENOMEM); | 1049 | bio_endio(bio, -ENOMEM); |
1038 | return; | 1050 | return ERR_PTR(-ENOMEM); |
1039 | } | 1051 | } |
1040 | req->start_time = start_time; | 1052 | req->start_time = start_time; |
1041 | 1053 | ||
@@ -1044,28 +1056,40 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long | |||
1044 | req->private_bio = NULL; | 1056 | req->private_bio = NULL; |
1045 | } | 1057 | } |
1046 | 1058 | ||
1047 | /* For WRITES going to the local disk, grab a reference on the target | 1059 | /* Update disk stats */ |
1048 | * extent. This waits for any resync activity in the corresponding | 1060 | _drbd_start_io_acct(mdev, req); |
1049 | * resync extent to finish, and, if necessary, pulls in the target | 1061 | |
1050 | * extent into the activity log, which involves further disk io because | ||
1051 | * of transactional on-disk meta data updates. | ||
1052 | * Empty flushes don't need to go into the activity log, they can only | ||
1053 | * flush data for pending writes which are already in there. */ | ||
1054 | if (rw == WRITE && req->private_bio && req->i.size | 1062 | if (rw == WRITE && req->private_bio && req->i.size |
1055 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { | 1063 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { |
1064 | if (!drbd_al_begin_io_fastpath(mdev, &req->i)) { | ||
1065 | drbd_queue_write(mdev, req); | ||
1066 | return NULL; | ||
1067 | } | ||
1056 | req->rq_state |= RQ_IN_ACT_LOG; | 1068 | req->rq_state |= RQ_IN_ACT_LOG; |
1057 | drbd_al_begin_io(mdev, &req->i); | ||
1058 | } | 1069 | } |
1059 | 1070 | ||
1071 | return req; | ||
1072 | } | ||
1073 | |||
1074 | static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req) | ||
1075 | { | ||
1076 | const int rw = bio_rw(req->master_bio); | ||
1077 | struct bio_and_error m = { NULL, }; | ||
1078 | bool no_remote = false; | ||
1079 | |||
1060 | spin_lock_irq(&mdev->tconn->req_lock); | 1080 | spin_lock_irq(&mdev->tconn->req_lock); |
1061 | if (rw == WRITE) { | 1081 | if (rw == WRITE) { |
1062 | /* This may temporarily give up the req_lock, | 1082 | /* This may temporarily give up the req_lock, |
1063 | * but will re-aquire it before it returns here. | 1083 | * but will re-aquire it before it returns here. |
1064 | * Needs to be before the check on drbd_suspended() */ | 1084 | * Needs to be before the check on drbd_suspended() */ |
1065 | complete_conflicting_writes(req); | 1085 | complete_conflicting_writes(req); |
1086 | /* no more giving up req_lock from now on! */ | ||
1087 | |||
1088 | /* check for congestion, and potentially stop sending | ||
1089 | * full data updates, but start sending "dirty bits" only. */ | ||
1090 | maybe_pull_ahead(mdev); | ||
1066 | } | 1091 | } |
1067 | 1092 | ||
1068 | /* no more giving up req_lock from now on! */ | ||
1069 | 1093 | ||
1070 | if (drbd_suspended(mdev)) { | 1094 | if (drbd_suspended(mdev)) { |
1071 | /* push back and retry: */ | 1095 | /* push back and retry: */ |
@@ -1078,9 +1102,6 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long | |||
1078 | goto out; | 1102 | goto out; |
1079 | } | 1103 | } |
1080 | 1104 | ||
1081 | /* Update disk stats */ | ||
1082 | _drbd_start_io_acct(mdev, req, bio); | ||
1083 | |||
1084 | /* We fail READ/READA early, if we can not serve it. | 1105 | /* We fail READ/READA early, if we can not serve it. |
1085 | * We must do this before req is registered on any lists. | 1106 | * We must do this before req is registered on any lists. |
1086 | * Otherwise, drbd_req_complete() will queue failed READ for retry. */ | 1107 | * Otherwise, drbd_req_complete() will queue failed READ for retry. */ |
@@ -1137,7 +1158,116 @@ out: | |||
1137 | 1158 | ||
1138 | if (m.bio) | 1159 | if (m.bio) |
1139 | complete_master_bio(mdev, &m); | 1160 | complete_master_bio(mdev, &m); |
1140 | return; | 1161 | } |
1162 | |||
1163 | void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) | ||
1164 | { | ||
1165 | struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time); | ||
1166 | if (IS_ERR_OR_NULL(req)) | ||
1167 | return; | ||
1168 | drbd_send_and_submit(mdev, req); | ||
1169 | } | ||
1170 | |||
1171 | static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming) | ||
1172 | { | ||
1173 | struct drbd_request *req, *tmp; | ||
1174 | list_for_each_entry_safe(req, tmp, incoming, tl_requests) { | ||
1175 | const int rw = bio_data_dir(req->master_bio); | ||
1176 | |||
1177 | if (rw == WRITE /* rw != WRITE should not even end up here! */ | ||
1178 | && req->private_bio && req->i.size | ||
1179 | && !test_bit(AL_SUSPENDED, &mdev->flags)) { | ||
1180 | if (!drbd_al_begin_io_fastpath(mdev, &req->i)) | ||
1181 | continue; | ||
1182 | |||
1183 | req->rq_state |= RQ_IN_ACT_LOG; | ||
1184 | } | ||
1185 | |||
1186 | list_del_init(&req->tl_requests); | ||
1187 | drbd_send_and_submit(mdev, req); | ||
1188 | } | ||
1189 | } | ||
1190 | |||
1191 | static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev, | ||
1192 | struct list_head *incoming, | ||
1193 | struct list_head *pending) | ||
1194 | { | ||
1195 | struct drbd_request *req, *tmp; | ||
1196 | int wake = 0; | ||
1197 | int err; | ||
1198 | |||
1199 | spin_lock_irq(&mdev->al_lock); | ||
1200 | list_for_each_entry_safe(req, tmp, incoming, tl_requests) { | ||
1201 | err = drbd_al_begin_io_nonblock(mdev, &req->i); | ||
1202 | if (err == -EBUSY) | ||
1203 | wake = 1; | ||
1204 | if (err) | ||
1205 | continue; | ||
1206 | req->rq_state |= RQ_IN_ACT_LOG; | ||
1207 | list_move_tail(&req->tl_requests, pending); | ||
1208 | } | ||
1209 | spin_unlock_irq(&mdev->al_lock); | ||
1210 | if (wake) | ||
1211 | wake_up(&mdev->al_wait); | ||
1212 | |||
1213 | return !list_empty(pending); | ||
1214 | } | ||
1215 | |||
1216 | void do_submit(struct work_struct *ws) | ||
1217 | { | ||
1218 | struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker); | ||
1219 | LIST_HEAD(incoming); | ||
1220 | LIST_HEAD(pending); | ||
1221 | struct drbd_request *req, *tmp; | ||
1222 | |||
1223 | for (;;) { | ||
1224 | spin_lock(&mdev->submit.lock); | ||
1225 | list_splice_tail_init(&mdev->submit.writes, &incoming); | ||
1226 | spin_unlock(&mdev->submit.lock); | ||
1227 | |||
1228 | submit_fast_path(mdev, &incoming); | ||
1229 | if (list_empty(&incoming)) | ||
1230 | break; | ||
1231 | |||
1232 | wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending)); | ||
1233 | /* Maybe more was queued, while we prepared the transaction? | ||
1234 | * Try to stuff them into this transaction as well. | ||
1235 | * Be strictly non-blocking here, no wait_event, we already | ||
1236 | * have something to commit. | ||
1237 | * Stop if we don't make any more progres. | ||
1238 | */ | ||
1239 | for (;;) { | ||
1240 | LIST_HEAD(more_pending); | ||
1241 | LIST_HEAD(more_incoming); | ||
1242 | bool made_progress; | ||
1243 | |||
1244 | /* It is ok to look outside the lock, | ||
1245 | * it's only an optimization anyways */ | ||
1246 | if (list_empty(&mdev->submit.writes)) | ||
1247 | break; | ||
1248 | |||
1249 | spin_lock(&mdev->submit.lock); | ||
1250 | list_splice_tail_init(&mdev->submit.writes, &more_incoming); | ||
1251 | spin_unlock(&mdev->submit.lock); | ||
1252 | |||
1253 | if (list_empty(&more_incoming)) | ||
1254 | break; | ||
1255 | |||
1256 | made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending); | ||
1257 | |||
1258 | list_splice_tail_init(&more_pending, &pending); | ||
1259 | list_splice_tail_init(&more_incoming, &incoming); | ||
1260 | |||
1261 | if (!made_progress) | ||
1262 | break; | ||
1263 | } | ||
1264 | drbd_al_begin_io_commit(mdev, false); | ||
1265 | |||
1266 | list_for_each_entry_safe(req, tmp, &pending, tl_requests) { | ||
1267 | list_del_init(&req->tl_requests); | ||
1268 | drbd_send_and_submit(mdev, req); | ||
1269 | } | ||
1270 | } | ||
1141 | } | 1271 | } |
1142 | 1272 | ||
1143 | void drbd_make_request(struct request_queue *q, struct bio *bio) | 1273 | void drbd_make_request(struct request_queue *q, struct bio *bio) |
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index c08d22964d06..978cb1addc98 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h | |||
@@ -88,6 +88,14 @@ enum drbd_req_event { | |||
88 | QUEUE_FOR_NET_READ, | 88 | QUEUE_FOR_NET_READ, |
89 | QUEUE_FOR_SEND_OOS, | 89 | QUEUE_FOR_SEND_OOS, |
90 | 90 | ||
91 | /* An empty flush is queued as P_BARRIER, | ||
92 | * which will cause it to complete "successfully", | ||
93 | * even if the local disk flush failed. | ||
94 | * | ||
95 | * Just like "real" requests, empty flushes (blkdev_issue_flush()) will | ||
96 | * only see an error if neither local nor remote data is reachable. */ | ||
97 | QUEUE_AS_DRBD_BARRIER, | ||
98 | |||
91 | SEND_CANCELED, | 99 | SEND_CANCELED, |
92 | SEND_FAILED, | 100 | SEND_FAILED, |
93 | HANDED_OVER_TO_NETWORK, | 101 | HANDED_OVER_TO_NETWORK, |
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 0fe220cfb9e9..90c5be2b1d30 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c | |||
@@ -570,6 +570,13 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns) | |||
570 | mdev->tconn->agreed_pro_version < 88) | 570 | mdev->tconn->agreed_pro_version < 88) |
571 | rv = SS_NOT_SUPPORTED; | 571 | rv = SS_NOT_SUPPORTED; |
572 | 572 | ||
573 | else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) | ||
574 | rv = SS_NO_UP_TO_DATE_DISK; | ||
575 | |||
576 | else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && | ||
577 | ns.pdsk == D_UNKNOWN) | ||
578 | rv = SS_NEED_CONNECTION; | ||
579 | |||
573 | else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) | 580 | else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) |
574 | rv = SS_CONNECTED_OUTDATES; | 581 | rv = SS_CONNECTED_OUTDATES; |
575 | 582 | ||
@@ -635,6 +642,10 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_t | |||
635 | && os.conn < C_WF_REPORT_PARAMS) | 642 | && os.conn < C_WF_REPORT_PARAMS) |
636 | rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ | 643 | rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ |
637 | 644 | ||
645 | if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED && | ||
646 | os.conn < C_CONNECTED && os.pdsk > D_OUTDATED) | ||
647 | rv = SS_OUTDATE_WO_CONN; | ||
648 | |||
638 | return rv; | 649 | return rv; |
639 | } | 650 | } |
640 | 651 | ||
@@ -1377,13 +1388,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, | |||
1377 | &drbd_bmio_set_n_write, &abw_start_sync, | 1388 | &drbd_bmio_set_n_write, &abw_start_sync, |
1378 | "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); | 1389 | "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); |
1379 | 1390 | ||
1380 | /* We are invalidating our self... */ | ||
1381 | if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && | ||
1382 | os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) | ||
1383 | /* other bitmap operation expected during this phase */ | ||
1384 | drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, | ||
1385 | "set_n_write from invalidate", BM_LOCKED_MASK); | ||
1386 | |||
1387 | /* first half of local IO error, failure to attach, | 1391 | /* first half of local IO error, failure to attach, |
1388 | * or administrative detach */ | 1392 | * or administrative detach */ |
1389 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { | 1393 | if (os.disk != D_FAILED && ns.disk == D_FAILED) { |
@@ -1748,13 +1752,9 @@ _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state | |||
1748 | if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) | 1752 | if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) |
1749 | return SS_CW_FAILED_BY_PEER; | 1753 | return SS_CW_FAILED_BY_PEER; |
1750 | 1754 | ||
1751 | rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; | 1755 | rv = conn_is_valid_transition(tconn, mask, val, 0); |
1752 | 1756 | if (rv == SS_SUCCESS && tconn->cstate == C_WF_REPORT_PARAMS) | |
1753 | if (rv == SS_UNKNOWN_ERROR) | 1757 | rv = SS_UNKNOWN_ERROR; /* continue waiting */ |
1754 | rv = conn_is_valid_transition(tconn, mask, val, 0); | ||
1755 | |||
1756 | if (rv == SS_SUCCESS) | ||
1757 | rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ | ||
1758 | 1758 | ||
1759 | return rv; | 1759 | return rv; |
1760 | } | 1760 | } |
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 9a664bd27404..58e08ff2b2ce 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c | |||
@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = { | |||
89 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", | 89 | [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", |
90 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", | 90 | [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", |
91 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", | 91 | [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", |
92 | [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer", | ||
92 | [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", | 93 | [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", |
93 | }; | 94 | }; |
94 | 95 | ||
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 424dc7bdf9b7..891c0ecaa292 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error) | |||
89 | md_io->done = 1; | 89 | md_io->done = 1; |
90 | wake_up(&mdev->misc_wait); | 90 | wake_up(&mdev->misc_wait); |
91 | bio_put(bio); | 91 | bio_put(bio); |
92 | put_ldev(mdev); | 92 | if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ |
93 | put_ldev(mdev); | ||
93 | } | 94 | } |
94 | 95 | ||
95 | /* reads on behalf of the partner, | 96 | /* reads on behalf of the partner, |
@@ -1410,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) | |||
1410 | struct drbd_conf *mdev = w->mdev; | 1411 | struct drbd_conf *mdev = w->mdev; |
1411 | 1412 | ||
1412 | if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) | 1413 | if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) |
1413 | drbd_al_begin_io(mdev, &req->i); | 1414 | drbd_al_begin_io(mdev, &req->i, false); |
1414 | 1415 | ||
1415 | drbd_req_make_private_bio(req, req->master_bio); | 1416 | drbd_req_make_private_bio(req, req->master_bio); |
1416 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; | 1417 | req->private_bio->bi_bdev = mdev->ldev->backing_bdev; |
@@ -1425,7 +1426,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev) | |||
1425 | int resync_after; | 1426 | int resync_after; |
1426 | 1427 | ||
1427 | while (1) { | 1428 | while (1) { |
1428 | if (!odev->ldev) | 1429 | if (!odev->ldev || odev->state.disk == D_DISKLESS) |
1429 | return 1; | 1430 | return 1; |
1430 | rcu_read_lock(); | 1431 | rcu_read_lock(); |
1431 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; | 1432 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; |
@@ -1433,7 +1434,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev) | |||
1433 | if (resync_after == -1) | 1434 | if (resync_after == -1) |
1434 | return 1; | 1435 | return 1; |
1435 | odev = minor_to_mdev(resync_after); | 1436 | odev = minor_to_mdev(resync_after); |
1436 | if (!expect(odev)) | 1437 | if (!odev) |
1437 | return 1; | 1438 | return 1; |
1438 | if ((odev->state.conn >= C_SYNC_SOURCE && | 1439 | if ((odev->state.conn >= C_SYNC_SOURCE && |
1439 | odev->state.conn <= C_PAUSED_SYNC_T) || | 1440 | odev->state.conn <= C_PAUSED_SYNC_T) || |
@@ -1515,7 +1516,7 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) | |||
1515 | 1516 | ||
1516 | if (o_minor == -1) | 1517 | if (o_minor == -1) |
1517 | return NO_ERROR; | 1518 | return NO_ERROR; |
1518 | if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) | 1519 | if (o_minor < -1 || o_minor > MINORMASK) |
1519 | return ERR_RESYNC_AFTER; | 1520 | return ERR_RESYNC_AFTER; |
1520 | 1521 | ||
1521 | /* check for loops */ | 1522 | /* check for loops */ |
@@ -1524,6 +1525,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) | |||
1524 | if (odev == mdev) | 1525 | if (odev == mdev) |
1525 | return ERR_RESYNC_AFTER_CYCLE; | 1526 | return ERR_RESYNC_AFTER_CYCLE; |
1526 | 1527 | ||
1528 | /* You are free to depend on diskless, non-existing, | ||
1529 | * or not yet/no longer existing minors. | ||
1530 | * We only reject dependency loops. | ||
1531 | * We cannot follow the dependency chain beyond a detached or | ||
1532 | * missing minor. | ||
1533 | */ | ||
1534 | if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS) | ||
1535 | return NO_ERROR; | ||
1536 | |||
1527 | rcu_read_lock(); | 1537 | rcu_read_lock(); |
1528 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; | 1538 | resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; |
1529 | rcu_read_unlock(); | 1539 | rcu_read_unlock(); |
@@ -1652,7 +1662,9 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) | |||
1652 | clear_bit(B_RS_H_DONE, &mdev->flags); | 1662 | clear_bit(B_RS_H_DONE, &mdev->flags); |
1653 | 1663 | ||
1654 | write_lock_irq(&global_state_lock); | 1664 | write_lock_irq(&global_state_lock); |
1655 | if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { | 1665 | /* Did some connection breakage or IO error race with us? */ |
1666 | if (mdev->state.conn < C_CONNECTED | ||
1667 | || !get_ldev_if_state(mdev, D_NEGOTIATING)) { | ||
1656 | write_unlock_irq(&global_state_lock); | 1668 | write_unlock_irq(&global_state_lock); |
1657 | mutex_unlock(mdev->state_mutex); | 1669 | mutex_unlock(mdev->state_mutex); |
1658 | return; | 1670 | return; |
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 076ae7f1b781..a56cfcd5d648 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c | |||
@@ -780,6 +780,7 @@ static const struct block_device_operations mg_disk_ops = { | |||
780 | .getgeo = mg_getgeo | 780 | .getgeo = mg_getgeo |
781 | }; | 781 | }; |
782 | 782 | ||
783 | #ifdef CONFIG_PM_SLEEP | ||
783 | static int mg_suspend(struct device *dev) | 784 | static int mg_suspend(struct device *dev) |
784 | { | 785 | { |
785 | struct mg_drv_data *prv_data = dev->platform_data; | 786 | struct mg_drv_data *prv_data = dev->platform_data; |
@@ -824,6 +825,7 @@ static int mg_resume(struct device *dev) | |||
824 | 825 | ||
825 | return 0; | 826 | return 0; |
826 | } | 827 | } |
828 | #endif | ||
827 | 829 | ||
828 | static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume); | 830 | static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume); |
829 | 831 | ||
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 32c678028e53..847107ef0cce 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c | |||
@@ -728,7 +728,10 @@ static void mtip_async_complete(struct mtip_port *port, | |||
728 | atomic_set(&port->commands[tag].active, 0); | 728 | atomic_set(&port->commands[tag].active, 0); |
729 | release_slot(port, tag); | 729 | release_slot(port, tag); |
730 | 730 | ||
731 | up(&port->cmd_slot); | 731 | if (unlikely(command->unaligned)) |
732 | up(&port->cmd_slot_unal); | ||
733 | else | ||
734 | up(&port->cmd_slot); | ||
732 | } | 735 | } |
733 | 736 | ||
734 | /* | 737 | /* |
@@ -1560,10 +1563,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) | |||
1560 | } | 1563 | } |
1561 | #endif | 1564 | #endif |
1562 | 1565 | ||
1566 | #ifdef MTIP_TRIM /* Disabling TRIM support temporarily */ | ||
1563 | /* Demux ID.DRAT & ID.RZAT to determine trim support */ | 1567 | /* Demux ID.DRAT & ID.RZAT to determine trim support */ |
1564 | if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) | 1568 | if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) |
1565 | port->dd->trim_supp = true; | 1569 | port->dd->trim_supp = true; |
1566 | else | 1570 | else |
1571 | #endif | ||
1567 | port->dd->trim_supp = false; | 1572 | port->dd->trim_supp = false; |
1568 | 1573 | ||
1569 | /* Set the identify buffer as valid. */ | 1574 | /* Set the identify buffer as valid. */ |
@@ -2557,7 +2562,7 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, | |||
2557 | */ | 2562 | */ |
2558 | static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, | 2563 | static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, |
2559 | int nsect, int nents, int tag, void *callback, | 2564 | int nsect, int nents, int tag, void *callback, |
2560 | void *data, int dir) | 2565 | void *data, int dir, int unaligned) |
2561 | { | 2566 | { |
2562 | struct host_to_dev_fis *fis; | 2567 | struct host_to_dev_fis *fis; |
2563 | struct mtip_port *port = dd->port; | 2568 | struct mtip_port *port = dd->port; |
@@ -2570,6 +2575,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, | |||
2570 | 2575 | ||
2571 | command->scatter_ents = nents; | 2576 | command->scatter_ents = nents; |
2572 | 2577 | ||
2578 | command->unaligned = unaligned; | ||
2573 | /* | 2579 | /* |
2574 | * The number of retries for this command before it is | 2580 | * The number of retries for this command before it is |
2575 | * reported as a failure to the upper layers. | 2581 | * reported as a failure to the upper layers. |
@@ -2598,6 +2604,9 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, | |||
2598 | fis->res3 = 0; | 2604 | fis->res3 = 0; |
2599 | fill_command_sg(dd, command, nents); | 2605 | fill_command_sg(dd, command, nents); |
2600 | 2606 | ||
2607 | if (unaligned) | ||
2608 | fis->device |= 1 << 7; | ||
2609 | |||
2601 | /* Populate the command header */ | 2610 | /* Populate the command header */ |
2602 | command->command_header->opts = | 2611 | command->command_header->opts = |
2603 | __force_bit2int cpu_to_le32( | 2612 | __force_bit2int cpu_to_le32( |
@@ -2644,9 +2653,13 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, | |||
2644 | * return value | 2653 | * return value |
2645 | * None | 2654 | * None |
2646 | */ | 2655 | */ |
2647 | static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) | 2656 | static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag, |
2657 | int unaligned) | ||
2648 | { | 2658 | { |
2659 | struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : | ||
2660 | &dd->port->cmd_slot; | ||
2649 | release_slot(dd->port, tag); | 2661 | release_slot(dd->port, tag); |
2662 | up(sem); | ||
2650 | } | 2663 | } |
2651 | 2664 | ||
2652 | /* | 2665 | /* |
@@ -2661,22 +2674,25 @@ static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) | |||
2661 | * or NULL if no command slots are available. | 2674 | * or NULL if no command slots are available. |
2662 | */ | 2675 | */ |
2663 | static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, | 2676 | static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, |
2664 | int *tag) | 2677 | int *tag, int unaligned) |
2665 | { | 2678 | { |
2679 | struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : | ||
2680 | &dd->port->cmd_slot; | ||
2681 | |||
2666 | /* | 2682 | /* |
2667 | * It is possible that, even with this semaphore, a thread | 2683 | * It is possible that, even with this semaphore, a thread |
2668 | * may think that no command slots are available. Therefore, we | 2684 | * may think that no command slots are available. Therefore, we |
2669 | * need to make an attempt to get_slot(). | 2685 | * need to make an attempt to get_slot(). |
2670 | */ | 2686 | */ |
2671 | down(&dd->port->cmd_slot); | 2687 | down(sem); |
2672 | *tag = get_slot(dd->port); | 2688 | *tag = get_slot(dd->port); |
2673 | 2689 | ||
2674 | if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { | 2690 | if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { |
2675 | up(&dd->port->cmd_slot); | 2691 | up(sem); |
2676 | return NULL; | 2692 | return NULL; |
2677 | } | 2693 | } |
2678 | if (unlikely(*tag < 0)) { | 2694 | if (unlikely(*tag < 0)) { |
2679 | up(&dd->port->cmd_slot); | 2695 | up(sem); |
2680 | return NULL; | 2696 | return NULL; |
2681 | } | 2697 | } |
2682 | 2698 | ||
@@ -3010,6 +3026,11 @@ static inline void hba_setup(struct driver_data *dd) | |||
3010 | dd->mmio + HOST_HSORG); | 3026 | dd->mmio + HOST_HSORG); |
3011 | } | 3027 | } |
3012 | 3028 | ||
3029 | static int mtip_device_unaligned_constrained(struct driver_data *dd) | ||
3030 | { | ||
3031 | return (dd->pdev->device == P420M_DEVICE_ID ? 1 : 0); | ||
3032 | } | ||
3033 | |||
3013 | /* | 3034 | /* |
3014 | * Detect the details of the product, and store anything needed | 3035 | * Detect the details of the product, and store anything needed |
3015 | * into the driver data structure. This includes product type and | 3036 | * into the driver data structure. This includes product type and |
@@ -3232,8 +3253,15 @@ static int mtip_hw_init(struct driver_data *dd) | |||
3232 | for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) | 3253 | for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) |
3233 | dd->work[i].port = dd->port; | 3254 | dd->work[i].port = dd->port; |
3234 | 3255 | ||
3256 | /* Enable unaligned IO constraints for some devices */ | ||
3257 | if (mtip_device_unaligned_constrained(dd)) | ||
3258 | dd->unal_qdepth = MTIP_MAX_UNALIGNED_SLOTS; | ||
3259 | else | ||
3260 | dd->unal_qdepth = 0; | ||
3261 | |||
3235 | /* Counting semaphore to track command slot usage */ | 3262 | /* Counting semaphore to track command slot usage */ |
3236 | sema_init(&dd->port->cmd_slot, num_command_slots - 1); | 3263 | sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth); |
3264 | sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); | ||
3237 | 3265 | ||
3238 | /* Spinlock to prevent concurrent issue */ | 3266 | /* Spinlock to prevent concurrent issue */ |
3239 | for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) | 3267 | for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) |
@@ -3836,7 +3864,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) | |||
3836 | struct scatterlist *sg; | 3864 | struct scatterlist *sg; |
3837 | struct bio_vec *bvec; | 3865 | struct bio_vec *bvec; |
3838 | int nents = 0; | 3866 | int nents = 0; |
3839 | int tag = 0; | 3867 | int tag = 0, unaligned = 0; |
3840 | 3868 | ||
3841 | if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { | 3869 | if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { |
3842 | if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, | 3870 | if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, |
@@ -3872,7 +3900,15 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) | |||
3872 | return; | 3900 | return; |
3873 | } | 3901 | } |
3874 | 3902 | ||
3875 | sg = mtip_hw_get_scatterlist(dd, &tag); | 3903 | if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 && |
3904 | dd->unal_qdepth) { | ||
3905 | if (bio->bi_sector % 8 != 0) /* Unaligned on 4k boundaries */ | ||
3906 | unaligned = 1; | ||
3907 | else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */ | ||
3908 | unaligned = 1; | ||
3909 | } | ||
3910 | |||
3911 | sg = mtip_hw_get_scatterlist(dd, &tag, unaligned); | ||
3876 | if (likely(sg != NULL)) { | 3912 | if (likely(sg != NULL)) { |
3877 | blk_queue_bounce(queue, &bio); | 3913 | blk_queue_bounce(queue, &bio); |
3878 | 3914 | ||
@@ -3880,7 +3916,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) | |||
3880 | dev_warn(&dd->pdev->dev, | 3916 | dev_warn(&dd->pdev->dev, |
3881 | "Maximum number of SGL entries exceeded\n"); | 3917 | "Maximum number of SGL entries exceeded\n"); |
3882 | bio_io_error(bio); | 3918 | bio_io_error(bio); |
3883 | mtip_hw_release_scatterlist(dd, tag); | 3919 | mtip_hw_release_scatterlist(dd, tag, unaligned); |
3884 | return; | 3920 | return; |
3885 | } | 3921 | } |
3886 | 3922 | ||
@@ -3900,7 +3936,8 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) | |||
3900 | tag, | 3936 | tag, |
3901 | bio_endio, | 3937 | bio_endio, |
3902 | bio, | 3938 | bio, |
3903 | bio_data_dir(bio)); | 3939 | bio_data_dir(bio), |
3940 | unaligned); | ||
3904 | } else | 3941 | } else |
3905 | bio_io_error(bio); | 3942 | bio_io_error(bio); |
3906 | } | 3943 | } |
@@ -4156,26 +4193,24 @@ static int mtip_block_remove(struct driver_data *dd) | |||
4156 | */ | 4193 | */ |
4157 | static int mtip_block_shutdown(struct driver_data *dd) | 4194 | static int mtip_block_shutdown(struct driver_data *dd) |
4158 | { | 4195 | { |
4159 | dev_info(&dd->pdev->dev, | ||
4160 | "Shutting down %s ...\n", dd->disk->disk_name); | ||
4161 | |||
4162 | /* Delete our gendisk structure, and cleanup the blk queue. */ | 4196 | /* Delete our gendisk structure, and cleanup the blk queue. */ |
4163 | if (dd->disk) { | 4197 | if (dd->disk) { |
4164 | if (dd->disk->queue) | 4198 | dev_info(&dd->pdev->dev, |
4199 | "Shutting down %s ...\n", dd->disk->disk_name); | ||
4200 | |||
4201 | if (dd->disk->queue) { | ||
4165 | del_gendisk(dd->disk); | 4202 | del_gendisk(dd->disk); |
4166 | else | 4203 | blk_cleanup_queue(dd->queue); |
4204 | } else | ||
4167 | put_disk(dd->disk); | 4205 | put_disk(dd->disk); |
4206 | dd->disk = NULL; | ||
4207 | dd->queue = NULL; | ||
4168 | } | 4208 | } |
4169 | 4209 | ||
4170 | |||
4171 | spin_lock(&rssd_index_lock); | 4210 | spin_lock(&rssd_index_lock); |
4172 | ida_remove(&rssd_index_ida, dd->index); | 4211 | ida_remove(&rssd_index_ida, dd->index); |
4173 | spin_unlock(&rssd_index_lock); | 4212 | spin_unlock(&rssd_index_lock); |
4174 | 4213 | ||
4175 | blk_cleanup_queue(dd->queue); | ||
4176 | dd->disk = NULL; | ||
4177 | dd->queue = NULL; | ||
4178 | |||
4179 | mtip_hw_shutdown(dd); | 4214 | mtip_hw_shutdown(dd); |
4180 | return 0; | 4215 | return 0; |
4181 | } | 4216 | } |
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 8e8334c9dd0f..3bb8a295fbe4 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h | |||
@@ -52,6 +52,9 @@ | |||
52 | #define MTIP_FTL_REBUILD_MAGIC 0xED51 | 52 | #define MTIP_FTL_REBUILD_MAGIC 0xED51 |
53 | #define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000 | 53 | #define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000 |
54 | 54 | ||
55 | /* unaligned IO handling */ | ||
56 | #define MTIP_MAX_UNALIGNED_SLOTS 8 | ||
57 | |||
55 | /* Macro to extract the tag bit number from a tag value. */ | 58 | /* Macro to extract the tag bit number from a tag value. */ |
56 | #define MTIP_TAG_BIT(tag) (tag & 0x1F) | 59 | #define MTIP_TAG_BIT(tag) (tag & 0x1F) |
57 | 60 | ||
@@ -333,6 +336,8 @@ struct mtip_cmd { | |||
333 | 336 | ||
334 | int scatter_ents; /* Number of scatter list entries used */ | 337 | int scatter_ents; /* Number of scatter list entries used */ |
335 | 338 | ||
339 | int unaligned; /* command is unaligned on 4k boundary */ | ||
340 | |||
336 | struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ | 341 | struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ |
337 | 342 | ||
338 | int retries; /* The number of retries left for this command. */ | 343 | int retries; /* The number of retries left for this command. */ |
@@ -452,6 +457,10 @@ struct mtip_port { | |||
452 | * command slots available. | 457 | * command slots available. |
453 | */ | 458 | */ |
454 | struct semaphore cmd_slot; | 459 | struct semaphore cmd_slot; |
460 | |||
461 | /* Semaphore to control queue depth of unaligned IOs */ | ||
462 | struct semaphore cmd_slot_unal; | ||
463 | |||
455 | /* Spinlock for working around command-issue bug. */ | 464 | /* Spinlock for working around command-issue bug. */ |
456 | spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; | 465 | spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; |
457 | }; | 466 | }; |
@@ -502,6 +511,8 @@ struct driver_data { | |||
502 | 511 | ||
503 | int isr_binding; | 512 | int isr_binding; |
504 | 513 | ||
514 | int unal_qdepth; /* qdepth of unaligned IO queue */ | ||
515 | |||
505 | struct list_head online_list; /* linkage for online list */ | 516 | struct list_head online_list; /* linkage for online list */ |
506 | 517 | ||
507 | struct list_head remove_list; /* linkage for removing list */ | 518 | struct list_head remove_list; /* linkage for removing list */ |
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 4d8d90b4fe78..3bfc8f1da9fe 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -174,6 +174,8 @@ config MD_FAULTY | |||
174 | 174 | ||
175 | In unsure, say N. | 175 | In unsure, say N. |
176 | 176 | ||
177 | source "drivers/md/bcache/Kconfig" | ||
178 | |||
177 | config BLK_DEV_DM | 179 | config BLK_DEV_DM |
178 | tristate "Device mapper support" | 180 | tristate "Device mapper support" |
179 | ---help--- | 181 | ---help--- |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 7ceeaefc0e95..1439fd4ad9b1 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -29,6 +29,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o | |||
29 | obj-$(CONFIG_MD_RAID456) += raid456.o | 29 | obj-$(CONFIG_MD_RAID456) += raid456.o |
30 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o | 30 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o |
31 | obj-$(CONFIG_MD_FAULTY) += faulty.o | 31 | obj-$(CONFIG_MD_FAULTY) += faulty.o |
32 | obj-$(CONFIG_BCACHE) += bcache/ | ||
32 | obj-$(CONFIG_BLK_DEV_MD) += md-mod.o | 33 | obj-$(CONFIG_BLK_DEV_MD) += md-mod.o |
33 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | 34 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o |
34 | obj-$(CONFIG_DM_BUFIO) += dm-bufio.o | 35 | obj-$(CONFIG_DM_BUFIO) += dm-bufio.o |
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig new file mode 100644 index 000000000000..05c220d05e23 --- /dev/null +++ b/drivers/md/bcache/Kconfig | |||
@@ -0,0 +1,42 @@ | |||
1 | |||
2 | config BCACHE | ||
3 | tristate "Block device as cache" | ||
4 | select CLOSURES | ||
5 | ---help--- | ||
6 | Allows a block device to be used as cache for other devices; uses | ||
7 | a btree for indexing and the layout is optimized for SSDs. | ||
8 | |||
9 | See Documentation/bcache.txt for details. | ||
10 | |||
11 | config BCACHE_DEBUG | ||
12 | bool "Bcache debugging" | ||
13 | depends on BCACHE | ||
14 | ---help--- | ||
15 | Don't select this option unless you're a developer | ||
16 | |||
17 | Enables extra debugging tools (primarily a fuzz tester) | ||
18 | |||
19 | config BCACHE_EDEBUG | ||
20 | bool "Extended runtime checks" | ||
21 | depends on BCACHE | ||
22 | ---help--- | ||
23 | Don't select this option unless you're a developer | ||
24 | |||
25 | Enables extra runtime checks which significantly affect performance | ||
26 | |||
27 | config BCACHE_CLOSURES_DEBUG | ||
28 | bool "Debug closures" | ||
29 | depends on BCACHE | ||
30 | select DEBUG_FS | ||
31 | ---help--- | ||
32 | Keeps all active closures in a linked list and provides a debugfs | ||
33 | interface to list them, which makes it possible to see asynchronous | ||
34 | operations that get stuck. | ||
35 | |||
36 | # cgroup code needs to be updated: | ||
37 | # | ||
38 | #config CGROUP_BCACHE | ||
39 | # bool "Cgroup controls for bcache" | ||
40 | # depends on BCACHE && BLK_CGROUP | ||
41 | # ---help--- | ||
42 | # TODO | ||
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile new file mode 100644 index 000000000000..0e9c82523be6 --- /dev/null +++ b/drivers/md/bcache/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | |||
2 | obj-$(CONFIG_BCACHE) += bcache.o | ||
3 | |||
4 | bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ | ||
5 | movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o | ||
6 | |||
7 | CFLAGS_request.o += -Iblock | ||
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c new file mode 100644 index 000000000000..048f2947e08b --- /dev/null +++ b/drivers/md/bcache/alloc.c | |||
@@ -0,0 +1,599 @@ | |||
1 | /* | ||
2 | * Primary bucket allocation code | ||
3 | * | ||
4 | * Copyright 2012 Google, Inc. | ||
5 | * | ||
6 | * Allocation in bcache is done in terms of buckets: | ||
7 | * | ||
8 | * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in | ||
9 | * btree pointers - they must match for the pointer to be considered valid. | ||
10 | * | ||
11 | * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a | ||
12 | * bucket simply by incrementing its gen. | ||
13 | * | ||
14 | * The gens (along with the priorities; it's really the gens are important but | ||
15 | * the code is named as if it's the priorities) are written in an arbitrary list | ||
16 | * of buckets on disk, with a pointer to them in the journal header. | ||
17 | * | ||
18 | * When we invalidate a bucket, we have to write its new gen to disk and wait | ||
19 | * for that write to complete before we use it - otherwise after a crash we | ||
20 | * could have pointers that appeared to be good but pointed to data that had | ||
21 | * been overwritten. | ||
22 | * | ||
23 | * Since the gens and priorities are all stored contiguously on disk, we can | ||
24 | * batch this up: We fill up the free_inc list with freshly invalidated buckets, | ||
25 | * call prio_write(), and when prio_write() finishes we pull buckets off the | ||
26 | * free_inc list and optionally discard them. | ||
27 | * | ||
28 | * free_inc isn't the only freelist - if it was, we'd often have to sleep while | ||
29 | * priorities and gens were being written before we could allocate. c->free is a | ||
30 | * smaller freelist, and buckets on that list are always ready to be used. | ||
31 | * | ||
32 | * If we've got discards enabled, that happens when a bucket moves from the | ||
33 | * free_inc list to the free list. | ||
34 | * | ||
35 | * There is another freelist, because sometimes we have buckets that we know | ||
36 | * have nothing pointing into them - these we can reuse without waiting for | ||
37 | * priorities to be rewritten. These come from freed btree nodes and buckets | ||
38 | * that garbage collection discovered no longer had valid keys pointing into | ||
39 | * them (because they were overwritten). That's the unused list - buckets on the | ||
40 | * unused list move to the free list, optionally being discarded in the process. | ||
41 | * | ||
42 | * It's also important to ensure that gens don't wrap around - with respect to | ||
43 | * either the oldest gen in the btree or the gen on disk. This is quite | ||
44 | * difficult to do in practice, but we explicitly guard against it anyways - if | ||
45 | * a bucket is in danger of wrapping around we simply skip invalidating it that | ||
46 | * time around, and we garbage collect or rewrite the priorities sooner than we | ||
47 | * would have otherwise. | ||
48 | * | ||
49 | * bch_bucket_alloc() allocates a single bucket from a specific cache. | ||
50 | * | ||
51 | * bch_bucket_alloc_set() allocates one or more buckets from different caches | ||
52 | * out of a cache set. | ||
53 | * | ||
54 | * free_some_buckets() drives all the processes described above. It's called | ||
55 | * from bch_bucket_alloc() and a few other places that need to make sure free | ||
56 | * buckets are ready. | ||
57 | * | ||
58 | * invalidate_buckets_(lru|fifo)() find buckets that are available to be | ||
59 | * invalidated, and then invalidate them and stick them on the free_inc list - | ||
60 | * in either lru or fifo order. | ||
61 | */ | ||
62 | |||
63 | #include "bcache.h" | ||
64 | #include "btree.h" | ||
65 | |||
66 | #include <linux/random.h> | ||
67 | |||
68 | #define MAX_IN_FLIGHT_DISCARDS 8U | ||
69 | |||
70 | /* Bucket heap / gen */ | ||
71 | |||
72 | uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) | ||
73 | { | ||
74 | uint8_t ret = ++b->gen; | ||
75 | |||
76 | ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); | ||
77 | WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); | ||
78 | |||
79 | if (CACHE_SYNC(&ca->set->sb)) { | ||
80 | ca->need_save_prio = max(ca->need_save_prio, | ||
81 | bucket_disk_gen(b)); | ||
82 | WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); | ||
83 | } | ||
84 | |||
85 | return ret; | ||
86 | } | ||
87 | |||
88 | void bch_rescale_priorities(struct cache_set *c, int sectors) | ||
89 | { | ||
90 | struct cache *ca; | ||
91 | struct bucket *b; | ||
92 | unsigned next = c->nbuckets * c->sb.bucket_size / 1024; | ||
93 | unsigned i; | ||
94 | int r; | ||
95 | |||
96 | atomic_sub(sectors, &c->rescale); | ||
97 | |||
98 | do { | ||
99 | r = atomic_read(&c->rescale); | ||
100 | |||
101 | if (r >= 0) | ||
102 | return; | ||
103 | } while (atomic_cmpxchg(&c->rescale, r, r + next) != r); | ||
104 | |||
105 | mutex_lock(&c->bucket_lock); | ||
106 | |||
107 | c->min_prio = USHRT_MAX; | ||
108 | |||
109 | for_each_cache(ca, c, i) | ||
110 | for_each_bucket(b, ca) | ||
111 | if (b->prio && | ||
112 | b->prio != BTREE_PRIO && | ||
113 | !atomic_read(&b->pin)) { | ||
114 | b->prio--; | ||
115 | c->min_prio = min(c->min_prio, b->prio); | ||
116 | } | ||
117 | |||
118 | mutex_unlock(&c->bucket_lock); | ||
119 | } | ||
120 | |||
121 | /* Discard/TRIM */ | ||
122 | |||
123 | struct discard { | ||
124 | struct list_head list; | ||
125 | struct work_struct work; | ||
126 | struct cache *ca; | ||
127 | long bucket; | ||
128 | |||
129 | struct bio bio; | ||
130 | struct bio_vec bv; | ||
131 | }; | ||
132 | |||
133 | static void discard_finish(struct work_struct *w) | ||
134 | { | ||
135 | struct discard *d = container_of(w, struct discard, work); | ||
136 | struct cache *ca = d->ca; | ||
137 | char buf[BDEVNAME_SIZE]; | ||
138 | |||
139 | if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { | ||
140 | pr_notice("discard error on %s, disabling", | ||
141 | bdevname(ca->bdev, buf)); | ||
142 | d->ca->discard = 0; | ||
143 | } | ||
144 | |||
145 | mutex_lock(&ca->set->bucket_lock); | ||
146 | |||
147 | fifo_push(&ca->free, d->bucket); | ||
148 | list_add(&d->list, &ca->discards); | ||
149 | atomic_dec(&ca->discards_in_flight); | ||
150 | |||
151 | mutex_unlock(&ca->set->bucket_lock); | ||
152 | |||
153 | closure_wake_up(&ca->set->bucket_wait); | ||
154 | wake_up(&ca->set->alloc_wait); | ||
155 | |||
156 | closure_put(&ca->set->cl); | ||
157 | } | ||
158 | |||
159 | static void discard_endio(struct bio *bio, int error) | ||
160 | { | ||
161 | struct discard *d = container_of(bio, struct discard, bio); | ||
162 | schedule_work(&d->work); | ||
163 | } | ||
164 | |||
165 | static void do_discard(struct cache *ca, long bucket) | ||
166 | { | ||
167 | struct discard *d = list_first_entry(&ca->discards, | ||
168 | struct discard, list); | ||
169 | |||
170 | list_del(&d->list); | ||
171 | d->bucket = bucket; | ||
172 | |||
173 | atomic_inc(&ca->discards_in_flight); | ||
174 | closure_get(&ca->set->cl); | ||
175 | |||
176 | bio_init(&d->bio); | ||
177 | |||
178 | d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); | ||
179 | d->bio.bi_bdev = ca->bdev; | ||
180 | d->bio.bi_rw = REQ_WRITE|REQ_DISCARD; | ||
181 | d->bio.bi_max_vecs = 1; | ||
182 | d->bio.bi_io_vec = d->bio.bi_inline_vecs; | ||
183 | d->bio.bi_size = bucket_bytes(ca); | ||
184 | d->bio.bi_end_io = discard_endio; | ||
185 | bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | ||
186 | |||
187 | submit_bio(0, &d->bio); | ||
188 | } | ||
189 | |||
190 | /* Allocation */ | ||
191 | |||
192 | static inline bool can_inc_bucket_gen(struct bucket *b) | ||
193 | { | ||
194 | return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && | ||
195 | bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; | ||
196 | } | ||
197 | |||
198 | bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) | ||
199 | { | ||
200 | BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); | ||
201 | |||
202 | if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && | ||
203 | CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) | ||
204 | return false; | ||
205 | |||
206 | b->prio = 0; | ||
207 | |||
208 | if (can_inc_bucket_gen(b) && | ||
209 | fifo_push(&ca->unused, b - ca->buckets)) { | ||
210 | atomic_inc(&b->pin); | ||
211 | return true; | ||
212 | } | ||
213 | |||
214 | return false; | ||
215 | } | ||
216 | |||
217 | static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) | ||
218 | { | ||
219 | return GC_MARK(b) == GC_MARK_RECLAIMABLE && | ||
220 | !atomic_read(&b->pin) && | ||
221 | can_inc_bucket_gen(b); | ||
222 | } | ||
223 | |||
224 | static void invalidate_one_bucket(struct cache *ca, struct bucket *b) | ||
225 | { | ||
226 | bch_inc_gen(ca, b); | ||
227 | b->prio = INITIAL_PRIO; | ||
228 | atomic_inc(&b->pin); | ||
229 | fifo_push(&ca->free_inc, b - ca->buckets); | ||
230 | } | ||
231 | |||
232 | #define bucket_prio(b) \ | ||
233 | (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) | ||
234 | |||
235 | #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) | ||
236 | #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) | ||
237 | |||
238 | static void invalidate_buckets_lru(struct cache *ca) | ||
239 | { | ||
240 | struct bucket *b; | ||
241 | ssize_t i; | ||
242 | |||
243 | ca->heap.used = 0; | ||
244 | |||
245 | for_each_bucket(b, ca) { | ||
246 | /* | ||
247 | * If we fill up the unused list, if we then return before | ||
248 | * adding anything to the free_inc list we'll skip writing | ||
249 | * prios/gens and just go back to allocating from the unused | ||
250 | * list: | ||
251 | */ | ||
252 | if (fifo_full(&ca->unused)) | ||
253 | return; | ||
254 | |||
255 | if (!can_invalidate_bucket(ca, b)) | ||
256 | continue; | ||
257 | |||
258 | if (!GC_SECTORS_USED(b) && | ||
259 | bch_bucket_add_unused(ca, b)) | ||
260 | continue; | ||
261 | |||
262 | if (!heap_full(&ca->heap)) | ||
263 | heap_add(&ca->heap, b, bucket_max_cmp); | ||
264 | else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { | ||
265 | ca->heap.data[0] = b; | ||
266 | heap_sift(&ca->heap, 0, bucket_max_cmp); | ||
267 | } | ||
268 | } | ||
269 | |||
270 | for (i = ca->heap.used / 2 - 1; i >= 0; --i) | ||
271 | heap_sift(&ca->heap, i, bucket_min_cmp); | ||
272 | |||
273 | while (!fifo_full(&ca->free_inc)) { | ||
274 | if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { | ||
275 | /* | ||
276 | * We don't want to be calling invalidate_buckets() | ||
277 | * multiple times when it can't do anything | ||
278 | */ | ||
279 | ca->invalidate_needs_gc = 1; | ||
280 | bch_queue_gc(ca->set); | ||
281 | return; | ||
282 | } | ||
283 | |||
284 | invalidate_one_bucket(ca, b); | ||
285 | } | ||
286 | } | ||
287 | |||
288 | static void invalidate_buckets_fifo(struct cache *ca) | ||
289 | { | ||
290 | struct bucket *b; | ||
291 | size_t checked = 0; | ||
292 | |||
293 | while (!fifo_full(&ca->free_inc)) { | ||
294 | if (ca->fifo_last_bucket < ca->sb.first_bucket || | ||
295 | ca->fifo_last_bucket >= ca->sb.nbuckets) | ||
296 | ca->fifo_last_bucket = ca->sb.first_bucket; | ||
297 | |||
298 | b = ca->buckets + ca->fifo_last_bucket++; | ||
299 | |||
300 | if (can_invalidate_bucket(ca, b)) | ||
301 | invalidate_one_bucket(ca, b); | ||
302 | |||
303 | if (++checked >= ca->sb.nbuckets) { | ||
304 | ca->invalidate_needs_gc = 1; | ||
305 | bch_queue_gc(ca->set); | ||
306 | return; | ||
307 | } | ||
308 | } | ||
309 | } | ||
310 | |||
311 | static void invalidate_buckets_random(struct cache *ca) | ||
312 | { | ||
313 | struct bucket *b; | ||
314 | size_t checked = 0; | ||
315 | |||
316 | while (!fifo_full(&ca->free_inc)) { | ||
317 | size_t n; | ||
318 | get_random_bytes(&n, sizeof(n)); | ||
319 | |||
320 | n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); | ||
321 | n += ca->sb.first_bucket; | ||
322 | |||
323 | b = ca->buckets + n; | ||
324 | |||
325 | if (can_invalidate_bucket(ca, b)) | ||
326 | invalidate_one_bucket(ca, b); | ||
327 | |||
328 | if (++checked >= ca->sb.nbuckets / 2) { | ||
329 | ca->invalidate_needs_gc = 1; | ||
330 | bch_queue_gc(ca->set); | ||
331 | return; | ||
332 | } | ||
333 | } | ||
334 | } | ||
335 | |||
336 | static void invalidate_buckets(struct cache *ca) | ||
337 | { | ||
338 | if (ca->invalidate_needs_gc) | ||
339 | return; | ||
340 | |||
341 | switch (CACHE_REPLACEMENT(&ca->sb)) { | ||
342 | case CACHE_REPLACEMENT_LRU: | ||
343 | invalidate_buckets_lru(ca); | ||
344 | break; | ||
345 | case CACHE_REPLACEMENT_FIFO: | ||
346 | invalidate_buckets_fifo(ca); | ||
347 | break; | ||
348 | case CACHE_REPLACEMENT_RANDOM: | ||
349 | invalidate_buckets_random(ca); | ||
350 | break; | ||
351 | } | ||
352 | |||
353 | pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", | ||
354 | fifo_used(&ca->free), ca->free.size, | ||
355 | fifo_used(&ca->free_inc), ca->free_inc.size, | ||
356 | fifo_used(&ca->unused), ca->unused.size); | ||
357 | } | ||
358 | |||
359 | #define allocator_wait(ca, cond) \ | ||
360 | do { \ | ||
361 | DEFINE_WAIT(__wait); \ | ||
362 | \ | ||
363 | while (1) { \ | ||
364 | prepare_to_wait(&ca->set->alloc_wait, \ | ||
365 | &__wait, TASK_INTERRUPTIBLE); \ | ||
366 | if (cond) \ | ||
367 | break; \ | ||
368 | \ | ||
369 | mutex_unlock(&(ca)->set->bucket_lock); \ | ||
370 | if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ | ||
371 | finish_wait(&ca->set->alloc_wait, &__wait); \ | ||
372 | closure_return(cl); \ | ||
373 | } \ | ||
374 | \ | ||
375 | schedule(); \ | ||
376 | mutex_lock(&(ca)->set->bucket_lock); \ | ||
377 | } \ | ||
378 | \ | ||
379 | finish_wait(&ca->set->alloc_wait, &__wait); \ | ||
380 | } while (0) | ||
381 | |||
382 | void bch_allocator_thread(struct closure *cl) | ||
383 | { | ||
384 | struct cache *ca = container_of(cl, struct cache, alloc); | ||
385 | |||
386 | mutex_lock(&ca->set->bucket_lock); | ||
387 | |||
388 | while (1) { | ||
389 | /* | ||
390 | * First, we pull buckets off of the unused and free_inc lists, | ||
391 | * possibly issue discards to them, then we add the bucket to | ||
392 | * the free list: | ||
393 | */ | ||
394 | while (1) { | ||
395 | long bucket; | ||
396 | |||
397 | if ((!atomic_read(&ca->set->prio_blocked) || | ||
398 | !CACHE_SYNC(&ca->set->sb)) && | ||
399 | !fifo_empty(&ca->unused)) | ||
400 | fifo_pop(&ca->unused, bucket); | ||
401 | else if (!fifo_empty(&ca->free_inc)) | ||
402 | fifo_pop(&ca->free_inc, bucket); | ||
403 | else | ||
404 | break; | ||
405 | |||
406 | allocator_wait(ca, (int) fifo_free(&ca->free) > | ||
407 | atomic_read(&ca->discards_in_flight)); | ||
408 | |||
409 | if (ca->discard) { | ||
410 | allocator_wait(ca, !list_empty(&ca->discards)); | ||
411 | do_discard(ca, bucket); | ||
412 | } else { | ||
413 | fifo_push(&ca->free, bucket); | ||
414 | closure_wake_up(&ca->set->bucket_wait); | ||
415 | } | ||
416 | } | ||
417 | |||
418 | /* | ||
419 | * We've run out of free buckets, we need to find some buckets | ||
420 | * we can invalidate. First, invalidate them in memory and add | ||
421 | * them to the free_inc list: | ||
422 | */ | ||
423 | |||
424 | allocator_wait(ca, ca->set->gc_mark_valid && | ||
425 | (ca->need_save_prio > 64 || | ||
426 | !ca->invalidate_needs_gc)); | ||
427 | invalidate_buckets(ca); | ||
428 | |||
429 | /* | ||
430 | * Now, we write their new gens to disk so we can start writing | ||
431 | * new stuff to them: | ||
432 | */ | ||
433 | allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); | ||
434 | if (CACHE_SYNC(&ca->set->sb) && | ||
435 | (!fifo_empty(&ca->free_inc) || | ||
436 | ca->need_save_prio > 64)) | ||
437 | bch_prio_write(ca); | ||
438 | } | ||
439 | } | ||
440 | |||
441 | long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) | ||
442 | { | ||
443 | long r = -1; | ||
444 | again: | ||
445 | wake_up(&ca->set->alloc_wait); | ||
446 | |||
447 | if (fifo_used(&ca->free) > ca->watermark[watermark] && | ||
448 | fifo_pop(&ca->free, r)) { | ||
449 | struct bucket *b = ca->buckets + r; | ||
450 | #ifdef CONFIG_BCACHE_EDEBUG | ||
451 | size_t iter; | ||
452 | long i; | ||
453 | |||
454 | for (iter = 0; iter < prio_buckets(ca) * 2; iter++) | ||
455 | BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); | ||
456 | |||
457 | fifo_for_each(i, &ca->free, iter) | ||
458 | BUG_ON(i == r); | ||
459 | fifo_for_each(i, &ca->free_inc, iter) | ||
460 | BUG_ON(i == r); | ||
461 | fifo_for_each(i, &ca->unused, iter) | ||
462 | BUG_ON(i == r); | ||
463 | #endif | ||
464 | BUG_ON(atomic_read(&b->pin) != 1); | ||
465 | |||
466 | SET_GC_SECTORS_USED(b, ca->sb.bucket_size); | ||
467 | |||
468 | if (watermark <= WATERMARK_METADATA) { | ||
469 | SET_GC_MARK(b, GC_MARK_METADATA); | ||
470 | b->prio = BTREE_PRIO; | ||
471 | } else { | ||
472 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | ||
473 | b->prio = INITIAL_PRIO; | ||
474 | } | ||
475 | |||
476 | return r; | ||
477 | } | ||
478 | |||
479 | pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", | ||
480 | atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), | ||
481 | fifo_used(&ca->free_inc), fifo_used(&ca->unused)); | ||
482 | |||
483 | if (cl) { | ||
484 | closure_wait(&ca->set->bucket_wait, cl); | ||
485 | |||
486 | if (closure_blocking(cl)) { | ||
487 | mutex_unlock(&ca->set->bucket_lock); | ||
488 | closure_sync(cl); | ||
489 | mutex_lock(&ca->set->bucket_lock); | ||
490 | goto again; | ||
491 | } | ||
492 | } | ||
493 | |||
494 | return -1; | ||
495 | } | ||
496 | |||
497 | void bch_bucket_free(struct cache_set *c, struct bkey *k) | ||
498 | { | ||
499 | unsigned i; | ||
500 | |||
501 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
502 | struct bucket *b = PTR_BUCKET(c, k, i); | ||
503 | |||
504 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | ||
505 | SET_GC_SECTORS_USED(b, 0); | ||
506 | bch_bucket_add_unused(PTR_CACHE(c, k, i), b); | ||
507 | } | ||
508 | } | ||
509 | |||
510 | int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | ||
511 | struct bkey *k, int n, struct closure *cl) | ||
512 | { | ||
513 | int i; | ||
514 | |||
515 | lockdep_assert_held(&c->bucket_lock); | ||
516 | BUG_ON(!n || n > c->caches_loaded || n > 8); | ||
517 | |||
518 | bkey_init(k); | ||
519 | |||
520 | /* sort by free space/prio of oldest data in caches */ | ||
521 | |||
522 | for (i = 0; i < n; i++) { | ||
523 | struct cache *ca = c->cache_by_alloc[i]; | ||
524 | long b = bch_bucket_alloc(ca, watermark, cl); | ||
525 | |||
526 | if (b == -1) | ||
527 | goto err; | ||
528 | |||
529 | k->ptr[i] = PTR(ca->buckets[b].gen, | ||
530 | bucket_to_sector(c, b), | ||
531 | ca->sb.nr_this_dev); | ||
532 | |||
533 | SET_KEY_PTRS(k, i + 1); | ||
534 | } | ||
535 | |||
536 | return 0; | ||
537 | err: | ||
538 | bch_bucket_free(c, k); | ||
539 | __bkey_put(c, k); | ||
540 | return -1; | ||
541 | } | ||
542 | |||
543 | int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | ||
544 | struct bkey *k, int n, struct closure *cl) | ||
545 | { | ||
546 | int ret; | ||
547 | mutex_lock(&c->bucket_lock); | ||
548 | ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); | ||
549 | mutex_unlock(&c->bucket_lock); | ||
550 | return ret; | ||
551 | } | ||
552 | |||
553 | /* Init */ | ||
554 | |||
555 | void bch_cache_allocator_exit(struct cache *ca) | ||
556 | { | ||
557 | struct discard *d; | ||
558 | |||
559 | while (!list_empty(&ca->discards)) { | ||
560 | d = list_first_entry(&ca->discards, struct discard, list); | ||
561 | cancel_work_sync(&d->work); | ||
562 | list_del(&d->list); | ||
563 | kfree(d); | ||
564 | } | ||
565 | } | ||
566 | |||
567 | int bch_cache_allocator_init(struct cache *ca) | ||
568 | { | ||
569 | unsigned i; | ||
570 | |||
571 | /* | ||
572 | * Reserve: | ||
573 | * Prio/gen writes first | ||
574 | * Then 8 for btree allocations | ||
575 | * Then half for the moving garbage collector | ||
576 | */ | ||
577 | |||
578 | ca->watermark[WATERMARK_PRIO] = 0; | ||
579 | |||
580 | ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); | ||
581 | |||
582 | ca->watermark[WATERMARK_MOVINGGC] = 8 + | ||
583 | ca->watermark[WATERMARK_METADATA]; | ||
584 | |||
585 | ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + | ||
586 | ca->watermark[WATERMARK_MOVINGGC]; | ||
587 | |||
588 | for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { | ||
589 | struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); | ||
590 | if (!d) | ||
591 | return -ENOMEM; | ||
592 | |||
593 | d->ca = ca; | ||
594 | INIT_WORK(&d->work, discard_finish); | ||
595 | list_add(&d->list, &ca->discards); | ||
596 | } | ||
597 | |||
598 | return 0; | ||
599 | } | ||
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h new file mode 100644 index 000000000000..340146d7c17f --- /dev/null +++ b/drivers/md/bcache/bcache.h | |||
@@ -0,0 +1,1259 @@ | |||
1 | #ifndef _BCACHE_H | ||
2 | #define _BCACHE_H | ||
3 | |||
4 | /* | ||
5 | * SOME HIGH LEVEL CODE DOCUMENTATION: | ||
6 | * | ||
7 | * Bcache mostly works with cache sets, cache devices, and backing devices. | ||
8 | * | ||
9 | * Support for multiple cache devices hasn't quite been finished off yet, but | ||
10 | * it's about 95% plumbed through. A cache set and its cache devices is sort of | ||
11 | * like a md raid array and its component devices. Most of the code doesn't care | ||
12 | * about individual cache devices, the main abstraction is the cache set. | ||
13 | * | ||
14 | * Multiple cache devices is intended to give us the ability to mirror dirty | ||
15 | * cached data and metadata, without mirroring clean cached data. | ||
16 | * | ||
17 | * Backing devices are different, in that they have a lifetime independent of a | ||
18 | * cache set. When you register a newly formatted backing device it'll come up | ||
19 | * in passthrough mode, and then you can attach and detach a backing device from | ||
20 | * a cache set at runtime - while it's mounted and in use. Detaching implicitly | ||
21 | * invalidates any cached data for that backing device. | ||
22 | * | ||
23 | * A cache set can have multiple (many) backing devices attached to it. | ||
24 | * | ||
25 | * There's also flash only volumes - this is the reason for the distinction | ||
26 | * between struct cached_dev and struct bcache_device. A flash only volume | ||
27 | * works much like a bcache device that has a backing device, except the | ||
28 | * "cached" data is always dirty. The end result is that we get thin | ||
29 | * provisioning with very little additional code. | ||
30 | * | ||
31 | * Flash only volumes work but they're not production ready because the moving | ||
32 | * garbage collector needs more work. More on that later. | ||
33 | * | ||
34 | * BUCKETS/ALLOCATION: | ||
35 | * | ||
36 | * Bcache is primarily designed for caching, which means that in normal | ||
37 | * operation all of our available space will be allocated. Thus, we need an | ||
38 | * efficient way of deleting things from the cache so we can write new things to | ||
39 | * it. | ||
40 | * | ||
41 | * To do this, we first divide the cache device up into buckets. A bucket is the | ||
42 | * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ | ||
43 | * works efficiently. | ||
44 | * | ||
45 | * Each bucket has a 16 bit priority, and an 8 bit generation associated with | ||
46 | * it. The gens and priorities for all the buckets are stored contiguously and | ||
47 | * packed on disk (in a linked list of buckets - aside from the superblock, all | ||
48 | * of bcache's metadata is stored in buckets). | ||
49 | * | ||
50 | * The priority is used to implement an LRU. We reset a bucket's priority when | ||
51 | * we allocate it or on a cache hit, and every so often we decrement the priority | ||
52 | * of each bucket. It could be used to implement something more sophisticated, | ||
53 | * if anyone ever gets around to it. | ||
54 | * | ||
55 | * The generation is used for invalidating buckets. Each pointer also has an 8 | ||
56 | * bit generation embedded in it; for a pointer to be considered valid, its gen | ||
57 | * must match the gen of the bucket it points into. Thus, to reuse a bucket all | ||
58 | * we have to do is increment its gen (and write its new gen to disk; we batch | ||
59 | * this up). | ||
60 | * | ||
61 | * Bcache is entirely COW - we never write twice to a bucket, even buckets that | ||
62 | * contain metadata (including btree nodes). | ||
63 | * | ||
64 | * THE BTREE: | ||
65 | * | ||
66 | * Bcache is in large part designed around the btree. | ||
67 | * | ||
68 | * At a high level, the btree is just an index of key -> ptr tuples. | ||
69 | * | ||
70 | * Keys represent extents, and thus have a size field. Keys also have a variable | ||
71 | * number of pointers attached to them (potentially zero, which is handy for | ||
72 | * invalidating the cache). | ||
73 | * | ||
74 | * The key itself is an inode:offset pair. The inode number corresponds to a | ||
75 | * backing device or a flash only volume. The offset is the ending offset of the | ||
76 | * extent within the inode - not the starting offset; this makes lookups | ||
77 | * slightly more convenient. | ||
78 | * | ||
79 | * Pointers contain the cache device id, the offset on that device, and an 8 bit | ||
80 | * generation number. More on the gen later. | ||
81 | * | ||
82 | * Index lookups are not fully abstracted - cache lookups in particular are | ||
83 | * still somewhat mixed in with the btree code, but things are headed in that | ||
84 | * direction. | ||
85 | * | ||
86 | * Updates are fairly well abstracted, though. There are two different ways of | ||
87 | * updating the btree; insert and replace. | ||
88 | * | ||
89 | * BTREE_INSERT will just take a list of keys and insert them into the btree - | ||
90 | * overwriting (possibly only partially) any extents they overlap with. This is | ||
91 | * used to update the index after a write. | ||
92 | * | ||
93 | * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is | ||
94 | * overwriting a key that matches another given key. This is used for inserting | ||
95 | * data into the cache after a cache miss, and for background writeback, and for | ||
96 | * the moving garbage collector. | ||
97 | * | ||
98 | * There is no "delete" operation; deleting things from the index is | ||
99 | * accomplished by either by invalidating pointers (by incrementing a bucket's | ||
100 | * gen) or by inserting a key with 0 pointers - which will overwrite anything | ||
101 | * previously present at that location in the index. | ||
102 | * | ||
103 | * This means that there are always stale/invalid keys in the btree. They're | ||
104 | * filtered out by the code that iterates through a btree node, and removed when | ||
105 | * a btree node is rewritten. | ||
106 | * | ||
107 | * BTREE NODES: | ||
108 | * | ||
109 | * Our unit of allocation is a bucket, and we can't arbitrarily allocate and | ||
110 | * free smaller than a bucket - so, that's how big our btree nodes are. | ||
111 | * | ||
112 | * (If buckets are really big we'll only use part of the bucket for a btree node | ||
113 | * - no less than 1/4th - but a bucket still contains no more than a single | ||
114 | * btree node. I'd actually like to change this, but for now we rely on the | ||
115 | * bucket's gen for deleting btree nodes when we rewrite/split a node.) | ||
116 | * | ||
117 | * Anyways, btree nodes are big - big enough to be inefficient with a textbook | ||
118 | * btree implementation. | ||
119 | * | ||
120 | * The way this is solved is that btree nodes are internally log structured; we | ||
121 | * can append new keys to an existing btree node without rewriting it. This | ||
122 | * means each set of keys we write is sorted, but the node is not. | ||
123 | * | ||
124 | * We maintain this log structure in memory - keeping 1Mb of keys sorted would | ||
125 | * be expensive, and we have to distinguish between the keys we have written and | ||
126 | * the keys we haven't. So to do a lookup in a btree node, we have to search | ||
127 | * each sorted set. But we do merge written sets together lazily, so the cost of | ||
128 | * these extra searches is quite low (normally most of the keys in a btree node | ||
129 | * will be in one big set, and then there'll be one or two sets that are much | ||
130 | * smaller). | ||
131 | * | ||
132 | * This log structure makes bcache's btree more of a hybrid between a | ||
133 | * conventional btree and a compacting data structure, with some of the | ||
134 | * advantages of both. | ||
135 | * | ||
136 | * GARBAGE COLLECTION: | ||
137 | * | ||
138 | * We can't just invalidate any bucket - it might contain dirty data or | ||
139 | * metadata. If it once contained dirty data, other writes might overwrite it | ||
140 | * later, leaving no valid pointers into that bucket in the index. | ||
141 | * | ||
142 | * Thus, the primary purpose of garbage collection is to find buckets to reuse. | ||
143 | * It also counts how much valid data each bucket currently contains, so that | ||
144 | * allocation can reuse buckets sooner when they've been mostly overwritten. | ||
145 | * | ||
146 | * It also does some things that are really internal to the btree | ||
147 | * implementation. If a btree node contains pointers that are stale by more than | ||
148 | * some threshold, it rewrites the btree node to avoid the bucket's generation | ||
149 | * wrapping around. It also merges adjacent btree nodes if they're empty enough. | ||
150 | * | ||
151 | * THE JOURNAL: | ||
152 | * | ||
153 | * Bcache's journal is not necessary for consistency; we always strictly | ||
154 | * order metadata writes so that the btree and everything else is consistent on | ||
155 | * disk in the event of an unclean shutdown, and in fact bcache had writeback | ||
156 | * caching (with recovery from unclean shutdown) before journalling was | ||
157 | * implemented. | ||
158 | * | ||
159 | * Rather, the journal is purely a performance optimization; we can't complete a | ||
160 | * write until we've updated the index on disk, otherwise the cache would be | ||
161 | * inconsistent in the event of an unclean shutdown. This means that without the | ||
162 | * journal, on random write workloads we constantly have to update all the leaf | ||
163 | * nodes in the btree, and those writes will be mostly empty (appending at most | ||
164 | * a few keys each) - highly inefficient in terms of amount of metadata writes, | ||
165 | * and it puts more strain on the various btree resorting/compacting code. | ||
166 | * | ||
167 | * The journal is just a log of keys we've inserted; on startup we just reinsert | ||
168 | * all the keys in the open journal entries. That means that when we're updating | ||
169 | * a node in the btree, we can wait until a 4k block of keys fills up before | ||
170 | * writing them out. | ||
171 | * | ||
172 | * For simplicity, we only journal updates to leaf nodes; updates to parent | ||
173 | * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth | ||
174 | * the complexity to deal with journalling them (in particular, journal replay) | ||
175 | * - updates to non leaf nodes just happen synchronously (see btree_split()). | ||
176 | */ | ||
177 | |||
178 | #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ | ||
179 | |||
180 | #include <linux/bio.h> | ||
181 | #include <linux/blktrace_api.h> | ||
182 | #include <linux/kobject.h> | ||
183 | #include <linux/list.h> | ||
184 | #include <linux/mutex.h> | ||
185 | #include <linux/rbtree.h> | ||
186 | #include <linux/rwsem.h> | ||
187 | #include <linux/types.h> | ||
188 | #include <linux/workqueue.h> | ||
189 | |||
190 | #include "util.h" | ||
191 | #include "closure.h" | ||
192 | |||
/*
 * In-memory per-bucket bookkeeping; one of these per bucket on a cache
 * device (see struct cache's buckets array). See the file header comment
 * for how prio and gen are used.
 */
struct bucket {
	atomic_t	pin;
	uint16_t	prio;		/* LRU priority - reset on allocation */
	uint8_t		gen;		/* key pointers must match this gen to be valid */
	uint8_t		disk_gen;	/* gen as last written out by prio_write() - TODO confirm */
	uint8_t		last_gc;	/* Most out of date gen in the btree */
	uint8_t		gc_gen;
	uint16_t	gc_mark;	/* bitfield; access via GC_MARK()/GC_SECTORS_USED() below */
};
202 | |||
203 | /* | ||
204 | * I'd use bitfields for these, but I don't trust the compiler not to screw me | ||
205 | * as multiple threads touch struct bucket without locking | ||
206 | */ | ||
207 | |||
208 | BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); | ||
209 | #define GC_MARK_RECLAIMABLE 0 | ||
210 | #define GC_MARK_DIRTY 1 | ||
211 | #define GC_MARK_METADATA 2 | ||
212 | BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); | ||
213 | |||
/*
 * An index key. Per the file header comment, keys are inode:offset pairs
 * describing extents, followed by a variable number of device pointers;
 * the exact bit packing within high/low is not visible here.
 */
struct bkey {
	uint64_t	high;
	uint64_t	low;
	uint64_t	ptr[];		/* flexible array of encoded pointers */
};
219 | |||
220 | /* Enough for a key with 6 pointers */ | ||
221 | #define BKEY_PAD 8 | ||
222 | |||
223 | #define BKEY_PADDED(key) \ | ||
224 | union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } | ||
225 | |||
226 | /* Version 0: Cache device | ||
227 | * Version 1: Backing device | ||
228 | * Version 2: Seed pointer into btree node checksum | ||
229 | * Version 3: Cache device with new UUID format | ||
230 | * Version 4: Backing device with data offset | ||
231 | */ | ||
232 | #define BCACHE_SB_VERSION_CDEV 0 | ||
233 | #define BCACHE_SB_VERSION_BDEV 1 | ||
234 | #define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 | ||
235 | #define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 | ||
236 | #define BCACHE_SB_MAX_VERSION 4 | ||
237 | |||
238 | #define SB_SECTOR 8 | ||
239 | #define SB_SIZE 4096 | ||
240 | #define SB_LABEL_SIZE 32 | ||
241 | #define SB_JOURNAL_BUCKETS 256U | ||
242 | /* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ | ||
243 | #define MAX_CACHES_PER_SET 8 | ||
244 | |||
245 | #define BDEV_DATA_START_DEFAULT 16 /* sectors */ | ||
246 | |||
/*
 * On-disk superblock, written at SB_SECTOR. The anonymous union carries
 * the fields that differ between cache devices and backing devices;
 * which interpretation applies follows from version
 * (BCACHE_SB_VERSION_CDEV* vs BCACHE_SB_VERSION_BDEV*).
 *
 * NOTE: on-disk format - field order and sizes must not change.
 */
struct cache_sb {
	uint64_t		csum;
	uint64_t		offset;		/* sector where this sb was written */
	uint64_t		version;

	uint8_t			magic[16];

	uint8_t			uuid[16];
	union {
		uint8_t		set_uuid[16];
		uint64_t	set_magic;
	};
	uint8_t			label[SB_LABEL_SIZE];

	uint64_t		flags;		/* accessed via the BITMASK()s below */
	uint64_t		seq;
	uint64_t		pad[8];

	union {
		struct {
			/* Cache devices */
			uint64_t	nbuckets;	/* device size */

			uint16_t	block_size;	/* sectors */
			uint16_t	bucket_size;	/* sectors */

			uint16_t	nr_in_set;
			uint16_t	nr_this_dev;
		};
		struct {
			/* Backing devices */
			uint64_t	data_offset;

			/*
			 * block_size from the cache device section is still used by
			 * backing devices, so don't add anything here until we fix
			 * things to not need it for backing devices anymore
			 */
		};
	};

	uint32_t		last_mount;	/* time_t */

	uint16_t		first_bucket;
	union {
		uint16_t	njournal_buckets;
		uint16_t	keys;
	};
	uint64_t		d[SB_JOURNAL_BUCKETS];	/* journal buckets */
};
297 | |||
298 | BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); | ||
299 | BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); | ||
300 | BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); | ||
301 | #define CACHE_REPLACEMENT_LRU 0U | ||
302 | #define CACHE_REPLACEMENT_FIFO 1U | ||
303 | #define CACHE_REPLACEMENT_RANDOM 2U | ||
304 | |||
305 | BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); | ||
306 | #define CACHE_MODE_WRITETHROUGH 0U | ||
307 | #define CACHE_MODE_WRITEBACK 1U | ||
308 | #define CACHE_MODE_WRITEAROUND 2U | ||
309 | #define CACHE_MODE_NONE 3U | ||
310 | BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); | ||
311 | #define BDEV_STATE_NONE 0U | ||
312 | #define BDEV_STATE_CLEAN 1U | ||
313 | #define BDEV_STATE_DIRTY 2U | ||
314 | #define BDEV_STATE_STALE 3U | ||
315 | |||
316 | /* Version 1: Seed pointer into btree node checksum | ||
317 | */ | ||
318 | #define BCACHE_BSET_VERSION 1 | ||
319 | |||
/*
 * This is the on disk format for btree nodes - a btree node on disk is a list
 * of these; within each set the keys are sorted.
 */
struct bset {
	uint64_t		csum;
	uint64_t		magic;
	uint64_t		seq;
	uint32_t		version;	/* BCACHE_BSET_VERSION */
	uint32_t		keys;		/* count of keys in this set; exact units not visible here - TODO confirm */

	union {
		struct bkey	start[0];	/* the key data, viewed as bkeys... */
		uint64_t	d[0];		/* ...or as raw u64s */
	};
};
336 | |||
/*
 * On disk format for priorities and gens - see super.c near prio_write() for
 * more. Per the file header comment, these live in a linked list of
 * buckets; next_bucket points at the next link.
 */
struct prio_set {
	uint64_t		csum;
	uint64_t		magic;
	uint64_t		seq;
	uint32_t		version;
	uint32_t		pad;

	uint64_t		next_bucket;	/* next bucket in the on-disk list */

	struct bucket_disk {
		uint16_t	prio;
		uint8_t		gen;
	} __attribute((packed)) data[];		/* one entry per bucket */
};
355 | |||
/*
 * One entry in bcache's uuid index (see cache_set->uuids /
 * uuid_bucket). The union with pad[] fixes the entry size at 128 bytes;
 * presumably this is an on-disk format - TODO confirm.
 */
struct uuid_entry {
	union {
		struct {
			uint8_t		uuid[16];
			uint8_t		label[32];
			uint32_t	first_reg;
			uint32_t	last_reg;
			uint32_t	invalidated;

			uint32_t	flags;	/* accessed via UUID_FLASH_ONLY() below */
			/* Size of flash only volumes */
			uint64_t	sectors;
		};

		uint8_t		pad[128];
	};
};
373 | |||
374 | BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); | ||
375 | |||
376 | #include "journal.h" | ||
377 | #include "stats.h" | ||
378 | struct search; | ||
379 | struct btree; | ||
380 | struct keybuf; | ||
381 | |||
/* One key held in a struct keybuf's rb tree. */
struct keybuf_key {
	struct rb_node		node;		/* in keybuf->keys */
	BKEY_PADDED(key);
	void			*private;	/* opaque per-key state for the keybuf's owner - TODO confirm */
};
387 | |||
388 | typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); | ||
389 | |||
/*
 * A buffer of keys collected from a btree scan, kept in an rb tree with a
 * small embedded freelist. Instances elsewhere in this file:
 * cached_dev->writeback_keys and cache_set->moving_gc_keys.
 */
struct keybuf {
	keybuf_pred_fn		*key_predicate;	/* presumably selects which keys enter the buffer - TODO confirm */

	struct bkey		last_scanned;	/* scan cursor */
	spinlock_t		lock;

	/*
	 * Beginning and end of range in rb tree - so that we can skip taking
	 * lock and checking the rb tree when we need to check for overlapping
	 * keys.
	 */
	struct bkey		start;
	struct bkey		end;

	struct rb_root		keys;

#define KEYBUF_NR		100
	DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
};
409 | |||
/* Allocation pools backing bio splits; used with struct bio_split_hook. */
struct bio_split_pool {
	struct bio_set		*bio_split;
	mempool_t		*bio_split_hook;	/* pool of struct bio_split_hook - TODO confirm */
};
414 | |||
/*
 * State for one split bio; holds the original bio's completion callback
 * and private data (presumably saved so they can be restored when all
 * splits complete - TODO confirm).
 */
struct bio_split_hook {
	struct closure		cl;
	struct bio_split_pool	*p;		/* pool this hook came from */
	struct bio		*bio;
	bio_end_io_t		*bi_end_io;	/* saved bio->bi_end_io */
	void			*bi_private;	/* saved bio->bi_private */
};
422 | |||
/*
 * Generic device state common to backing devices and flash only volumes
 * (see the file header comment); embedded in struct cached_dev.
 */
struct bcache_device {
	struct closure		cl;

	struct kobject		kobj;

	struct cache_set	*c;		/* cache set we're attached to, if any */
	unsigned		id;		/* index into c->devices - TODO confirm it's also the btree inode nr */
#define BCACHEDEVNAME_SIZE	12
	char			name[BCACHEDEVNAME_SIZE];

	struct gendisk		*disk;

	/* If nonzero, we're closing */
	atomic_t		closing;

	/* If nonzero, we're detaching/unregistering from cache set */
	atomic_t		detaching;

	atomic_long_t		sectors_dirty;
	unsigned long		sectors_dirty_gc;
	unsigned long		sectors_dirty_last;
	long			sectors_dirty_derivative;

	mempool_t		*unaligned_bvec;
	struct bio_set		*bio_split;

	unsigned		data_csum:1;

	/* Per device type hooks; set by the owning device code - TODO confirm */
	int (*cache_miss)(struct btree *, struct search *,
			  struct bio *, unsigned);
	int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);

	struct bio_split_pool	bio_split_hook;
};
457 | |||
struct io {
	/* Used to track sequential IO so it can be skipped */
	struct hlist_node	hash;		/* in cached_dev->io_hash */
	struct list_head	lru;		/* in cached_dev->io_lru */

	unsigned long		jiffies;	/* timestamp of last IO - TODO confirm */
	unsigned		sequential;	/* amount of sequential IO seen so far - TODO confirm units */
	sector_t		last;		/* where the last IO ended - TODO confirm */
};
467 | |||
/*
 * A backing device. Embeds the generic struct bcache_device; the rest is
 * caching and writeback state specific to backing devices.
 */
struct cached_dev {
	struct list_head	list;		/* on cache_set->cached_devs - TODO confirm */
	struct bcache_device	disk;
	struct block_device	*bdev;

	struct cache_sb		sb;
	struct bio		sb_bio;		/* preallocated bio + bvec for superblock writes */
	struct bio_vec		sb_bv[1];
	struct closure_with_waitlist sb_write;

	/* Refcount on the cache set. Always nonzero when we're caching. */
	atomic_t		count;
	struct work_struct	detach;

	/*
	 * Device might not be running if it's dirty and the cache set hasn't
	 * showed up yet.
	 */
	atomic_t		running;

	/*
	 * Writes take a shared lock from start to finish; scanning for dirty
	 * data to refill the rb tree requires an exclusive lock.
	 */
	struct rw_semaphore	writeback_lock;

	/*
	 * Nonzero, and writeback has a refcount (d->count), iff there is dirty
	 * data in the cache. Protected by writeback_lock; must have a
	 * shared lock to set and exclusive lock to clear.
	 */
	atomic_t		has_dirty;

	struct ratelimit	writeback_rate;
	struct delayed_work	writeback_rate_update;

	/*
	 * Internal to the writeback code, so read_dirty() can keep track of
	 * where it's at.
	 */
	sector_t		last_read;

	/* Number of writeback bios in flight */
	atomic_t		in_flight;
	struct closure_with_timer writeback;
	struct closure_waitlist	writeback_wait;

	struct keybuf		writeback_keys;	/* dirty keys queued for writeback */

	/* For tracking sequential IO */
#define RECENT_IO_BITS	7
#define RECENT_IO	(1 << RECENT_IO_BITS)
	struct io		io[RECENT_IO];
	struct hlist_head	io_hash[RECENT_IO + 1];
	struct list_head	io_lru;
	spinlock_t		io_lock;

	struct cache_accounting	accounting;

	/* The rest of this all shows up in sysfs */
	unsigned		sequential_cutoff;
	unsigned		readahead;

	unsigned		sequential_merge:1;
	unsigned		verify:1;

	unsigned		writeback_metadata:1;
	unsigned		writeback_running:1;
	unsigned char		writeback_percent;
	unsigned		writeback_delay;

	/* Writeback rate controller state (derivative/target suggest a PD-style controller - TODO confirm) */
	int			writeback_rate_change;
	int64_t			writeback_rate_derivative;
	uint64_t		writeback_rate_target;

	unsigned		writeback_rate_update_seconds;
	unsigned		writeback_rate_d_term;
	unsigned		writeback_rate_p_term_inverse;
	unsigned		writeback_rate_d_smooth;
};
548 | |||
/*
 * Allocation reserve classes; the per-class thresholds live in
 * cache->watermark[] and are computed in bch_cache_allocator_init()
 * (prio/gen writes first, then btree metadata, then moving gc).
 */
enum alloc_watermarks {
	WATERMARK_PRIO,
	WATERMARK_METADATA,
	WATERMARK_MOVINGGC,
	WATERMARK_NONE,
	WATERMARK_MAX		/* array size only - not a real reserve class */
};
556 | |||
/*
 * Per cache device state; a cache_set holds up to MAX_CACHES_PER_SET of
 * these.
 */
struct cache {
	struct cache_set	*set;
	struct cache_sb		sb;
	struct bio		sb_bio;		/* preallocated bio + bvec for superblock writes */
	struct bio_vec		sb_bv[1];

	struct kobject		kobj;
	struct block_device	*bdev;

	/* Reserve thresholds, indexed by enum alloc_watermarks */
	unsigned		watermark[WATERMARK_MAX];

	struct closure		alloc;
	struct workqueue_struct	*alloc_workqueue;

	struct closure		prio;
	struct prio_set		*disk_buckets;	/* buffer for the on-disk prio/gen format */

	/*
	 * When allocating new buckets, prio_write() gets first dibs - since we
	 * may not be able to allocate at all without writing priorities and
	 * gens. prio_buckets[] contains the last buckets we wrote priorities
	 * to (so gc can mark them as metadata), prio_next[] contains the
	 * buckets allocated for the next prio write.
	 *
	 * NOTE(review): the comment mentions prio_next[] but the field below
	 * is prio_last_buckets - confirm which name is current.
	 */
	uint64_t		*prio_buckets;
	uint64_t		*prio_last_buckets;

	/*
	 * free: Buckets that are ready to be used
	 *
	 * free_inc: Incoming buckets - these are buckets that currently have
	 * cached data in them, and we can't reuse them until after we write
	 * their new gen to disk. After prio_write() finishes writing the new
	 * gens/prios, they'll be moved to the free list (and possibly discarded
	 * in the process)
	 *
	 * unused: GC found nothing pointing into these buckets (possibly
	 * because all the data they contained was overwritten), so we only
	 * need to discard them before they can be moved to the free list.
	 */
	DECLARE_FIFO(long, free);
	DECLARE_FIFO(long, free_inc);
	DECLARE_FIFO(long, unused);

	size_t			fifo_last_bucket;

	/* Allocation stuff: */
	struct bucket		*buckets;	/* in-memory per-bucket state (struct bucket above) */

	DECLARE_HEAP(struct bucket *, heap);

	/*
	 * max(gen - disk_gen) for all buckets. When it gets too big we have to
	 * call prio_write() to keep gens from wrapping.
	 */
	uint8_t			need_save_prio;
	unsigned		gc_move_threshold;

	/*
	 * If nonzero, we know we aren't going to find any buckets to invalidate
	 * until a gc finishes - otherwise we could pointlessly burn a ton of
	 * cpu
	 */
	unsigned		invalidate_needs_gc:1;

	bool			discard; /* Get rid of? */

	/*
	 * We preallocate structs for issuing discards to buckets, and keep them
	 * on this list when they're not in use; do_discard() issues discards
	 * whenever there's work to do and is called by free_some_buckets() and
	 * when a discard finishes.
	 */
	atomic_t		discards_in_flight;
	struct list_head	discards;	/* idle struct discards; see bch_cache_allocator_init() */

	struct journal_device	journal;

	/* The rest of this all shows up in sysfs */
#define IO_ERROR_SHIFT		20
	atomic_t		io_errors;
	atomic_t		io_count;

	atomic_long_t		meta_sectors_written;
	atomic_long_t		btree_sectors_written;
	atomic_long_t		sectors_written;

	struct bio_split_pool	bio_split_hook;
};
646 | |||
/* Statistics gathered during a garbage collection pass (see cache_set->gc_stats). */
struct gc_stat {
	size_t			nodes;		/* btree nodes visited - TODO confirm */
	size_t			key_bytes;

	size_t			nkeys;
	uint64_t		data;	/* sectors */
	uint64_t		dirty;	/* sectors */
	unsigned		in_use;	/* percent */
};
656 | |||
657 | /* | ||
658 | * Flag bits, for how the cache set is shutting down, and what phase it's at: | ||
659 | * | ||
660 | * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching | ||
661 | * all the backing devices first (their cached data gets invalidated, and they | ||
662 | * won't automatically reattach). | ||
663 | * | ||
664 | * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; | ||
665 | * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. | ||
666 | * flushing dirty data). | ||
667 | * | ||
668 | * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down | ||
669 | * the allocation thread. | ||
670 | */ | ||
671 | #define CACHE_SET_UNREGISTERING 0 | ||
672 | #define CACHE_SET_STOPPING 1 | ||
673 | #define CACHE_SET_STOPPING_2 2 | ||
674 | |||
675 | struct cache_set { | ||
676 | struct closure cl; | ||
677 | |||
678 | struct list_head list; | ||
679 | struct kobject kobj; | ||
680 | struct kobject internal; | ||
681 | struct dentry *debug; | ||
682 | struct cache_accounting accounting; | ||
683 | |||
684 | unsigned long flags; | ||
685 | |||
686 | struct cache_sb sb; | ||
687 | |||
688 | struct cache *cache[MAX_CACHES_PER_SET]; | ||
689 | struct cache *cache_by_alloc[MAX_CACHES_PER_SET]; | ||
690 | int caches_loaded; | ||
691 | |||
692 | struct bcache_device **devices; | ||
693 | struct list_head cached_devs; | ||
694 | uint64_t cached_dev_sectors; | ||
695 | struct closure caching; | ||
696 | |||
697 | struct closure_with_waitlist sb_write; | ||
698 | |||
699 | mempool_t *search; | ||
700 | mempool_t *bio_meta; | ||
701 | struct bio_set *bio_split; | ||
702 | |||
703 | /* For the btree cache */ | ||
704 | struct shrinker shrink; | ||
705 | |||
706 | /* For the allocator itself */ | ||
707 | wait_queue_head_t alloc_wait; | ||
708 | |||
709 | /* For the btree cache and anything allocation related */ | ||
710 | struct mutex bucket_lock; | ||
711 | |||
712 | /* log2(bucket_size), in sectors */ | ||
713 | unsigned short bucket_bits; | ||
714 | |||
715 | /* log2(block_size), in sectors */ | ||
716 | unsigned short block_bits; | ||
717 | |||
718 | /* | ||
719 | * Default number of pages for a new btree node - may be less than a | ||
720 | * full bucket | ||
721 | */ | ||
722 | unsigned btree_pages; | ||
723 | |||
724 | /* | ||
725 | * Lists of struct btrees; lru is the list for structs that have memory | ||
726 | * allocated for actual btree node, freed is for structs that do not. | ||
727 | * | ||
728 | * We never free a struct btree, except on shutdown - we just put it on | ||
729 | * the btree_cache_freed list and reuse it later. This simplifies the | ||
730 | * code, and it doesn't cost us much memory as the memory usage is | ||
731 | * dominated by buffers that hold the actual btree node data and those | ||
732 | * can be freed - and the number of struct btrees allocated is | ||
733 | * effectively bounded. | ||
734 | * | ||
735 | * btree_cache_freeable effectively is a small cache - we use it because | ||
736 | * high order page allocations can be rather expensive, and it's quite | ||
737 | * common to delete and allocate btree nodes in quick succession. It | ||
738 | * should never grow past ~2-3 nodes in practice. | ||
739 | */ | ||
740 | struct list_head btree_cache; | ||
741 | struct list_head btree_cache_freeable; | ||
742 | struct list_head btree_cache_freed; | ||
743 | |||
744 | /* Number of elements in btree_cache + btree_cache_freeable lists */ | ||
745 | unsigned bucket_cache_used; | ||
746 | |||
747 | /* | ||
748 | * If we need to allocate memory for a new btree node and that | ||
749 | * allocation fails, we can cannibalize another node in the btree cache | ||
750 | * to satisfy the allocation. However, only one thread can be doing this | ||
751 | * at a time, for obvious reasons - try_harder and try_wait are | ||
752 | * basically a lock for this that we can wait on asynchronously. The | ||
753 | * btree_root() macro releases the lock when it returns. | ||
754 | */ | ||
755 | struct closure *try_harder; | ||
756 | struct closure_waitlist try_wait; | ||
757 | uint64_t try_harder_start; | ||
758 | |||
759 | /* | ||
760 | * When we free a btree node, we increment the gen of the bucket the | ||
761 | * node is in - but we can't rewrite the prios and gens until we | ||
762 | * finished whatever it is we were doing, otherwise after a crash the | ||
763 | * btree node would be freed but for say a split, we might not have the | ||
764 | * pointers to the new nodes inserted into the btree yet. | ||
765 | * | ||
766 | * This is a refcount that blocks prio_write() until the new keys are | ||
767 | * written. | ||
768 | */ | ||
769 | atomic_t prio_blocked; | ||
770 | struct closure_waitlist bucket_wait; | ||
771 | |||
772 | /* | ||
773 | * For any bio we don't skip we subtract the number of sectors from | ||
774 | * rescale; when it hits 0 we rescale all the bucket priorities. | ||
775 | */ | ||
776 | atomic_t rescale; | ||
777 | /* | ||
778 | * When we invalidate buckets, we use both the priority and the amount | ||
779 | * of good data to determine which buckets to reuse first - to weight | ||
780 | * those together consistently we keep track of the smallest nonzero | ||
781 | * priority of any bucket. | ||
782 | */ | ||
783 | uint16_t min_prio; | ||
784 | |||
785 | /* | ||
786 | * max(gen - gc_gen) for all buckets. When it gets too big we have to gc | ||
787 | * to keep gens from wrapping around. | ||
788 | */ | ||
789 | uint8_t need_gc; | ||
790 | struct gc_stat gc_stats; | ||
791 | size_t nbuckets; | ||
792 | |||
793 | struct closure_with_waitlist gc; | ||
794 | /* Where in the btree gc currently is */ | ||
795 | struct bkey gc_done; | ||
796 | |||
797 | /* | ||
798 | * The allocation code needs gc_mark in struct bucket to be correct, but | ||
799 | * it's not while a gc is in progress. Protected by bucket_lock. | ||
800 | */ | ||
801 | int gc_mark_valid; | ||
802 | |||
803 | /* Counts how many sectors bio_insert has added to the cache */ | ||
804 | atomic_t sectors_to_gc; | ||
805 | |||
806 | struct closure moving_gc; | ||
807 | struct closure_waitlist moving_gc_wait; | ||
808 | struct keybuf moving_gc_keys; | ||
809 | /* Number of moving GC bios in flight */ | ||
810 | atomic_t in_flight; | ||
811 | |||
812 | struct btree *root; | ||
813 | |||
814 | #ifdef CONFIG_BCACHE_DEBUG | ||
815 | struct btree *verify_data; | ||
816 | struct mutex verify_lock; | ||
817 | #endif | ||
818 | |||
819 | unsigned nr_uuids; | ||
820 | struct uuid_entry *uuids; | ||
821 | BKEY_PADDED(uuid_bucket); | ||
822 | struct closure_with_waitlist uuid_write; | ||
823 | |||
824 | /* | ||
825 | * A btree node on disk could have too many bsets for an iterator to fit | ||
826 | * on the stack - this is a single element mempool for btree_read_work() | ||
827 | */ | ||
828 | struct mutex fill_lock; | ||
829 | struct btree_iter *fill_iter; | ||
830 | |||
831 | /* | ||
832 | * btree_sort() is a merge sort and requires temporary space - single | ||
833 | * element mempool | ||
834 | */ | ||
835 | struct mutex sort_lock; | ||
836 | struct bset *sort; | ||
837 | |||
838 | /* List of buckets we're currently writing data to */ | ||
839 | struct list_head data_buckets; | ||
840 | spinlock_t data_bucket_lock; | ||
841 | |||
842 | struct journal journal; | ||
843 | |||
844 | #define CONGESTED_MAX 1024 | ||
845 | unsigned congested_last_us; | ||
846 | atomic_t congested; | ||
847 | |||
848 | /* The rest of this all shows up in sysfs */ | ||
849 | unsigned congested_read_threshold_us; | ||
850 | unsigned congested_write_threshold_us; | ||
851 | |||
852 | spinlock_t sort_time_lock; | ||
853 | struct time_stats sort_time; | ||
854 | struct time_stats btree_gc_time; | ||
855 | struct time_stats btree_split_time; | ||
856 | spinlock_t btree_read_time_lock; | ||
857 | struct time_stats btree_read_time; | ||
858 | struct time_stats try_harder_time; | ||
859 | |||
860 | atomic_long_t cache_read_races; | ||
861 | atomic_long_t writeback_keys_done; | ||
862 | atomic_long_t writeback_keys_failed; | ||
863 | unsigned error_limit; | ||
864 | unsigned error_decay; | ||
865 | unsigned short journal_delay_ms; | ||
866 | unsigned verify:1; | ||
867 | unsigned key_merging_disabled:1; | ||
868 | unsigned gc_always_rewrite:1; | ||
869 | unsigned shrinker_disabled:1; | ||
870 | unsigned copy_gc_enabled:1; | ||
871 | |||
872 | #define BUCKET_HASH_BITS 12 | ||
873 | struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; | ||
874 | }; | ||
875 | |||
876 | static inline bool key_merging_disabled(struct cache_set *c) | ||
877 | { | ||
878 | #ifdef CONFIG_BCACHE_DEBUG | ||
879 | return c->key_merging_disabled; | ||
880 | #else | ||
881 | return 0; | ||
882 | #endif | ||
883 | } | ||
884 | |||
885 | static inline bool SB_IS_BDEV(const struct cache_sb *sb) | ||
886 | { | ||
887 | return sb->version == BCACHE_SB_VERSION_BDEV | ||
888 | || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; | ||
889 | } | ||
890 | |||
/*
 * Per-io wrapper around struct bio carrying the bkey that describes
 * where the io is going.
 */
struct bbio {
	/* local_clock_us() timestamp taken at submit, for latency accounting */
	unsigned		submit_time_us;
	union {
		struct bkey	key;
		uint64_t	_pad[3];
		/*
		 * We only need pad = 3 here because we only ever carry around a
		 * single pointer - i.e. the pointer we're doing io to/from.
		 */
	};
	/* must be last: the bio (with its bvecs) is embedded at the tail */
	struct bio		bio;
};
903 | |||
/*
 * Current time in (approximate) microseconds: ns >> 10 instead of an
 * exact divide by 1000.
 */
static inline unsigned local_clock_us(void)
{
	unsigned long long ns = local_clock();

	return ns >> 10;
}
908 | |||
/* Max number of sorted key sets (bsets) per in-memory btree node */
#define MAX_BSETS		4U

/* Buckets holding btree nodes get the maximum priority */
#define BTREE_PRIO		USHRT_MAX
/* Priority given to buckets when they're (re)scaled/initialized */
#define INITIAL_PRIO		32768

/* Bytes of key space available in an in-memory btree node */
#define btree_bytes(c)		((c)->btree_pages * PAGE_SIZE)
/* On-disk size of btree node b, in cache-set blocks */
#define btree_blocks(b)							\
	((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits))

#define btree_default_blocks(c)						\
	((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))

/* Bucket size conversions; << 9 converts sectors to bytes */
#define bucket_pages(c)		((c)->sb.bucket_size / PAGE_SECTORS)
#define bucket_bytes(c)		((c)->sb.bucket_size << 9)
#define block_bytes(c)		((c)->sb.block_size << 9)

/* Bytes used by a bset header plus k u64s of keys */
#define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t))
#define set_bytes(i)		__set_bytes(i, i->keys)

#define __set_blocks(i, k, c)	DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
#define set_blocks(i, c)	__set_blocks(i, (i)->keys, c)

/* Key at u64 offset j within bset i; end(i) is one past the last key */
#define node(i, j)		((struct bkey *) ((i)->d + (j)))
#define end(i)			node(i, (i)->keys)

/* Block index of bset i within btree node b's data */
#define index(i, b)							\
	((size_t) (((void *) i - (void *) (b)->sets[0].data) /		\
		   block_bytes(b->c)))

#define btree_data_space(b)	(PAGE_SIZE << (b)->page_order)

/* How many struct bucket_disk entries fit in one prio bucket */
#define prios_per_bucket(c)				\
	((bucket_bytes(c) - sizeof(struct prio_set)) /	\
	 sizeof(struct bucket_disk))
/* Buckets needed to store prios/gens for every bucket on the device */
#define prio_buckets(c)					\
	DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))

/*
 * Per-structure magic numbers; each on-disk set is tagged with the
 * cache set's set_magic xored with the structure's constant, so data
 * from a different cache set (or structure type) is detectable.
 */
#define JSET_MAGIC		0x245235c1a3625032ULL
#define PSET_MAGIC		0x6750e15f87337f91ULL
#define BSET_MAGIC		0x90135c78b99e07f5ULL

#define jset_magic(c)		((c)->sb.set_magic ^ JSET_MAGIC)
#define pset_magic(c)		((c)->sb.set_magic ^ PSET_MAGIC)
#define bset_magic(c)		((c)->sb.set_magic ^ BSET_MAGIC)
953 | |||
/* Bkey fields: all units are in sectors */

/* Declare accessors for a bitfield packed into bkey->high or ->low */
#define KEY_FIELD(name, field, offset, size)				\
	BITMASK(name, struct bkey, field, offset, size)

/*
 * Declare accessors for a bitfield packed into each pointer word of a
 * bkey: generates name(k, i) and SET_name(k, i, v).
 */
#define PTR_FIELD(name, offset, size)					\
	static inline uint64_t name(const struct bkey *k, unsigned i)	\
	{ return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); }	\
									\
	static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
	{								\
		k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset);	\
		k->ptr[i] |= v << offset;				\
	}
968 | |||
KEY_FIELD(KEY_PTRS,	high, 60, 3)	/* number of pointers in ptr[] */
KEY_FIELD(HEADER_SIZE,	high, 58, 2)
KEY_FIELD(KEY_CSUM,	high, 56, 2)
KEY_FIELD(KEY_PINNED,	high, 55, 1)
KEY_FIELD(KEY_DIRTY,	high, 36, 1)

KEY_FIELD(KEY_SIZE,	high, 20, 16)	/* extent size, in sectors */
KEY_FIELD(KEY_INODE,	high, 0,  20)
977 | |||
/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */

/*
 * Offset, in sectors, of the *end* of the extent the key describes
 * (KEY_START() below computes the start as offset - size).
 */
static inline uint64_t KEY_OFFSET(const struct bkey *k)
{
	return k->low;
}

static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v)
{
	k->low = v;
}
989 | |||
PTR_FIELD(PTR_DEV,	51, 12)	/* index into cache_set->cache[] */
PTR_FIELD(PTR_OFFSET,	8,  43)	/* sector offset on that cache device */
PTR_FIELD(PTR_GEN,	0,  8)	/* generation the pointer was created with */

/* All-ones PTR_DEV: reserved as a placeholder/check value */
#define PTR_CHECK_DEV		((1 << 12) - 1)
995 | |||
/*
 * Build a raw pointer word from its fields (layout matches the
 * PTR_FIELD() declarations above: dev in bits 51..62, offset in
 * 8..50, gen in 0..7).  All macro arguments are parenthesized so
 * expression arguments expand safely.
 */
#define PTR(gen, offset, dev)						\
	((((uint64_t) (dev)) << 51) | (((uint64_t) (offset)) << 8) | (gen))
998 | |||
999 | static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) | ||
1000 | { | ||
1001 | return s >> c->bucket_bits; | ||
1002 | } | ||
1003 | |||
1004 | static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) | ||
1005 | { | ||
1006 | return ((sector_t) b) << c->bucket_bits; | ||
1007 | } | ||
1008 | |||
1009 | static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) | ||
1010 | { | ||
1011 | return s & (c->sb.bucket_size - 1); | ||
1012 | } | ||
1013 | |||
1014 | static inline struct cache *PTR_CACHE(struct cache_set *c, | ||
1015 | const struct bkey *k, | ||
1016 | unsigned ptr) | ||
1017 | { | ||
1018 | return c->cache[PTR_DEV(k, ptr)]; | ||
1019 | } | ||
1020 | |||
1021 | static inline size_t PTR_BUCKET_NR(struct cache_set *c, | ||
1022 | const struct bkey *k, | ||
1023 | unsigned ptr) | ||
1024 | { | ||
1025 | return sector_to_bucket(c, PTR_OFFSET(k, ptr)); | ||
1026 | } | ||
1027 | |||
1028 | static inline struct bucket *PTR_BUCKET(struct cache_set *c, | ||
1029 | const struct bkey *k, | ||
1030 | unsigned ptr) | ||
1031 | { | ||
1032 | return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); | ||
1033 | } | ||
1034 | |||
/* Btree key macros */

/*
 * The high bit being set is a relic from when we used it to do binary
 * searches - it told you where a key started. It's not used anymore,
 * and can probably be safely dropped.
 */
/* Build a bkey for inode 'dev', ending at 'sector', 'len' sectors long */
#define KEY(dev, sector, len)						\
((struct bkey) {							\
	.high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev),	\
	.low = (sector)							\
})
1047 | |||
1048 | static inline void bkey_init(struct bkey *k) | ||
1049 | { | ||
1050 | *k = KEY(0, 0, 0); | ||
1051 | } | ||
1052 | |||
/* First sector covered by k (keys store their *end* in KEY_OFFSET) */
#define KEY_START(k)		(KEY_OFFSET(k) - KEY_SIZE(k))
/* Zero-size key positioned at k's start */
#define START_KEY(k)		KEY(KEY_INODE(k), KEY_START(k), 0)
/* Largest representable key: max inode (20 bits), max positive offset */
#define MAX_KEY			KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0)
#define ZERO_KEY		KEY(0, 0, 0)

/*
 * This is used for various on disk data structures - cache_sb, prio_set, bset,
 * jset: The checksum is _always_ the first 8 bytes of these structs,
 * so the crc covers everything from just past it up to end(i).
 */
#define csum_set(i)							\
	bch_crc64(((void *) (i)) + sizeof(uint64_t),			\
		  ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t)))
1065 | |||
/* Error handling macros */

/*
 * Report an inconsistency against btree node b's cache set; when
 * bch_cache_set_error() returns true (error was acted on) also dump a
 * stack trace.
 */
#define btree_bug(b, ...)						\
do {									\
	if (bch_cache_set_error((b)->c, __VA_ARGS__))			\
		dump_stack();						\
} while (0)

/* As btree_bug(), but takes the cache set directly */
#define cache_bug(c, ...)						\
do {									\
	if (bch_cache_set_error(c, __VA_ARGS__))			\
		dump_stack();						\
} while (0)

/* Conditional variants: only fire when cond is true */
#define btree_bug_on(cond, b, ...)					\
do {									\
	if (cond)							\
		btree_bug(b, __VA_ARGS__);				\
} while (0)

#define cache_bug_on(cond, c, ...)					\
do {									\
	if (cond)							\
		cache_bug(c, __VA_ARGS__);				\
} while (0)

/* Report the error without the stack dump */
#define cache_set_err_on(cond, c, ...)					\
do {									\
	if (cond)							\
		bch_cache_set_error(c, __VA_ARGS__);			\
} while (0)
1097 | |||
/* Looping macros */

/*
 * Iterate over the caches in a cache set.  Note the comma expression:
 * ca is loaded *before* the bounds check, so on the final (failing)
 * test ca is read from cache[nr_in_set].  NOTE(review): assumes the
 * cache[] array has a usable slot at that index -- confirm against its
 * declared size.
 */
#define for_each_cache(ca, cs, iter)					\
	for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++)

/* Iterate over every data bucket of a cache, starting at first_bucket */
#define for_each_bucket(b, ca)						\
	for (b = (ca)->buckets + (ca)->sb.first_bucket;			\
	     b < (ca)->buckets + (ca)->sb.nbuckets; b++)
1106 | |||
1107 | static inline void __bkey_put(struct cache_set *c, struct bkey *k) | ||
1108 | { | ||
1109 | unsigned i; | ||
1110 | |||
1111 | for (i = 0; i < KEY_PTRS(k); i++) | ||
1112 | atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); | ||
1113 | } | ||
1114 | |||
/* Blktrace macros */

/*
 * Emit a blktrace message against c's underlying block device, if it
 * has a request queue.  c can be any struct with a ->bdev member.
 */
#define blktrace_msg(c, fmt, ...)					\
do {									\
	struct request_queue *q = bdev_get_queue(c->bdev);		\
	if (q)								\
		blk_add_trace_msg(q, fmt, ##__VA_ARGS__);		\
} while (0)

/* Emit the same blktrace message on every cache device in set s */
#define blktrace_msg_all(s, fmt, ...)					\
do {									\
	struct cache *_c;						\
	unsigned i;							\
	for_each_cache(_c, (s), i)					\
		blktrace_msg(_c, fmt, ##__VA_ARGS__);			\
} while (0)
1131 | |||
1132 | static inline void cached_dev_put(struct cached_dev *dc) | ||
1133 | { | ||
1134 | if (atomic_dec_and_test(&dc->count)) | ||
1135 | schedule_work(&dc->detach); | ||
1136 | } | ||
1137 | |||
/*
 * Try to take a reference on a cached_dev; fails (returns false) if
 * the refcount has already hit zero, i.e. the device is going away.
 */
static inline bool cached_dev_get(struct cached_dev *dc)
{
	if (!atomic_inc_not_zero(&dc->count))
		return false;

	/* Paired with the mb in cached_dev_attach */
	smp_mb__after_atomic_inc();
	return true;
}
1147 | |||
/*
 * bucket_gc_gen() returns the difference between the bucket's current gen and
 * the oldest gen of any pointer into that bucket in the btree (last_gc).
 *
 * bucket_disk_gen() returns the difference between the current gen and the gen
 * on disk; they're both used to make sure gens don't wrap around.
 *
 * Both rely on uint8_t wraparound arithmetic, so the results are
 * meaningful even after gen itself has wrapped.
 */

static inline uint8_t bucket_gc_gen(struct bucket *b)
{
	return b->gen - b->last_gc;
}

static inline uint8_t bucket_disk_gen(struct bucket *b)
{
	return b->gen - b->disk_gen;
}
1165 | |||
/* Limits on the gen deltas above before gc / prio write must run */
#define BUCKET_GC_GEN_MAX	96U
#define BUCKET_DISK_GEN_MAX	64U

/* Declare a write-only sysfs attribute named n, stored by fn */
#define kobj_attribute_write(n, fn)					\
	static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)

/* Declare a read-write (owner-only) sysfs attribute named n */
#define kobj_attribute_rw(n, show, store)				\
	static struct kobj_attribute ksysfs_##n =			\
		__ATTR(n, S_IWUSR|S_IRUSR, show, store)
1175 | |||
1176 | /* Forward declarations */ | ||
1177 | |||
1178 | void bch_writeback_queue(struct cached_dev *); | ||
1179 | void bch_writeback_add(struct cached_dev *, unsigned); | ||
1180 | |||
1181 | void bch_count_io_errors(struct cache *, int, const char *); | ||
1182 | void bch_bbio_count_io_errors(struct cache_set *, struct bio *, | ||
1183 | int, const char *); | ||
1184 | void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); | ||
1185 | void bch_bbio_free(struct bio *, struct cache_set *); | ||
1186 | struct bio *bch_bbio_alloc(struct cache_set *); | ||
1187 | |||
1188 | struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *); | ||
1189 | void bch_generic_make_request(struct bio *, struct bio_split_pool *); | ||
1190 | void __bch_submit_bbio(struct bio *, struct cache_set *); | ||
1191 | void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); | ||
1192 | |||
1193 | uint8_t bch_inc_gen(struct cache *, struct bucket *); | ||
1194 | void bch_rescale_priorities(struct cache_set *, int); | ||
1195 | bool bch_bucket_add_unused(struct cache *, struct bucket *); | ||
1196 | void bch_allocator_thread(struct closure *); | ||
1197 | |||
1198 | long bch_bucket_alloc(struct cache *, unsigned, struct closure *); | ||
1199 | void bch_bucket_free(struct cache_set *, struct bkey *); | ||
1200 | |||
1201 | int __bch_bucket_alloc_set(struct cache_set *, unsigned, | ||
1202 | struct bkey *, int, struct closure *); | ||
1203 | int bch_bucket_alloc_set(struct cache_set *, unsigned, | ||
1204 | struct bkey *, int, struct closure *); | ||
1205 | |||
1206 | __printf(2, 3) | ||
1207 | bool bch_cache_set_error(struct cache_set *, const char *, ...); | ||
1208 | |||
1209 | void bch_prio_write(struct cache *); | ||
1210 | void bch_write_bdev_super(struct cached_dev *, struct closure *); | ||
1211 | |||
1212 | extern struct workqueue_struct *bcache_wq, *bch_gc_wq; | ||
1213 | extern const char * const bch_cache_modes[]; | ||
1214 | extern struct mutex bch_register_lock; | ||
1215 | extern struct list_head bch_cache_sets; | ||
1216 | |||
1217 | extern struct kobj_type bch_cached_dev_ktype; | ||
1218 | extern struct kobj_type bch_flash_dev_ktype; | ||
1219 | extern struct kobj_type bch_cache_set_ktype; | ||
1220 | extern struct kobj_type bch_cache_set_internal_ktype; | ||
1221 | extern struct kobj_type bch_cache_ktype; | ||
1222 | |||
1223 | void bch_cached_dev_release(struct kobject *); | ||
1224 | void bch_flash_dev_release(struct kobject *); | ||
1225 | void bch_cache_set_release(struct kobject *); | ||
1226 | void bch_cache_release(struct kobject *); | ||
1227 | |||
1228 | int bch_uuid_write(struct cache_set *); | ||
1229 | void bcache_write_super(struct cache_set *); | ||
1230 | |||
1231 | int bch_flash_dev_create(struct cache_set *c, uint64_t size); | ||
1232 | |||
1233 | int bch_cached_dev_attach(struct cached_dev *, struct cache_set *); | ||
1234 | void bch_cached_dev_detach(struct cached_dev *); | ||
1235 | void bch_cached_dev_run(struct cached_dev *); | ||
1236 | void bcache_device_stop(struct bcache_device *); | ||
1237 | |||
1238 | void bch_cache_set_unregister(struct cache_set *); | ||
1239 | void bch_cache_set_stop(struct cache_set *); | ||
1240 | |||
1241 | struct cache_set *bch_cache_set_alloc(struct cache_sb *); | ||
1242 | void bch_btree_cache_free(struct cache_set *); | ||
1243 | int bch_btree_cache_alloc(struct cache_set *); | ||
1244 | void bch_writeback_init_cached_dev(struct cached_dev *); | ||
1245 | void bch_moving_init_cache_set(struct cache_set *); | ||
1246 | |||
1247 | void bch_cache_allocator_exit(struct cache *ca); | ||
1248 | int bch_cache_allocator_init(struct cache *ca); | ||
1249 | |||
1250 | void bch_debug_exit(void); | ||
1251 | int bch_debug_init(struct kobject *); | ||
1252 | void bch_writeback_exit(void); | ||
1253 | int bch_writeback_init(void); | ||
1254 | void bch_request_exit(void); | ||
1255 | int bch_request_init(void); | ||
1256 | void bch_btree_exit(void); | ||
1257 | int bch_btree_init(void); | ||
1258 | |||
1259 | #endif /* _BCACHE_H */ | ||
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c new file mode 100644 index 000000000000..cb4578a327b9 --- /dev/null +++ b/drivers/md/bcache/bset.c | |||
@@ -0,0 +1,1192 @@ | |||
1 | /* | ||
2 | * Code for working with individual keys, and sorted sets of keys with in a | ||
3 | * btree node | ||
4 | * | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include "bcache.h" | ||
9 | #include "btree.h" | ||
10 | #include "debug.h" | ||
11 | |||
12 | #include <linux/random.h> | ||
13 | #include <linux/prefetch.h> | ||
14 | |||
15 | /* Keylists */ | ||
16 | |||
17 | void bch_keylist_copy(struct keylist *dest, struct keylist *src) | ||
18 | { | ||
19 | *dest = *src; | ||
20 | |||
21 | if (src->list == src->d) { | ||
22 | size_t n = (uint64_t *) src->top - src->d; | ||
23 | dest->top = (struct bkey *) &dest->d[n]; | ||
24 | dest->list = dest->d; | ||
25 | } | ||
26 | } | ||
27 | |||
/*
 * Make room on keylist l for one more key with nptrs pointers (2 u64s
 * of key plus nptrs pointer u64s).  Returns 0 on success, -ENOMEM if
 * allocation fails or the list would grow past what fits in a journal
 * write.
 */
int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
{
	unsigned oldsize = (uint64_t *) l->top - l->list;
	unsigned newsize = oldsize + 2 + nptrs;
	uint64_t *new;

	/* The journalling code doesn't handle the case where the keys to insert
	 * is bigger than an empty write: If we just return -ENOMEM here,
	 * bio_insert() and bio_invalidate() will insert the keys created so far
	 * and finish the rest when the keylist is empty.
	 */
	if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
		return -ENOMEM;

	newsize = roundup_pow_of_two(newsize);

	/* Still fits in the inline array, or in the current allocation */
	if (newsize <= KEYLIST_INLINE ||
	    roundup_pow_of_two(oldsize) == newsize)
		return 0;

	/* Don't krealloc() the inline array: pass NULL to get a fresh buffer */
	new = krealloc(l->list == l->d ? NULL : l->list,
		       sizeof(uint64_t) * newsize, GFP_NOIO);

	if (!new)
		return -ENOMEM;

	/* First move off the inline array: copy the existing keys over */
	if (l->list == l->d)
		memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE);

	l->list = new;
	l->top = (struct bkey *) (&l->list[oldsize]);

	return 0;
}
62 | |||
63 | struct bkey *bch_keylist_pop(struct keylist *l) | ||
64 | { | ||
65 | struct bkey *k = l->bottom; | ||
66 | |||
67 | if (k == l->top) | ||
68 | return NULL; | ||
69 | |||
70 | while (bkey_next(k) != l->top) | ||
71 | k = bkey_next(k); | ||
72 | |||
73 | return l->top = k; | ||
74 | } | ||
75 | |||
/* Pointer validation */

/*
 * Check a key's basic sanity against the cache set; returns true (and
 * logs via cache_bug, except for the zero-size case) if it's bad.
 *
 *  - keys in interior nodes (level != 0) must have pointers and a
 *    nonzero size, and must not be dirty;
 *  - leaf keys must not have KEY_SIZE > KEY_OFFSET (offset is the
 *    extent's end, so that would put the start below zero);
 *  - zero-size keys return true without being logged;
 *  - each available pointer must lie wholly within a valid data bucket
 *    of its cache device.
 */
bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
{
	unsigned i;

	if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
		goto bad;

	if (!level && KEY_SIZE(k) > KEY_OFFSET(k))
		goto bad;

	if (!KEY_SIZE(k))
		return true;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i)) {
			struct cache *ca = PTR_CACHE(c, k, i);
			size_t bucket = PTR_BUCKET_NR(c, k, i);
			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

			/* extent must not cross a bucket boundary */
			if (KEY_SIZE(k) + r > c->sb.bucket_size ||
			    bucket < ca->sb.first_bucket ||
			    bucket >= ca->sb.nbuckets)
				goto bad;
		}

	return false;
bad:
	cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
	return true;
}
108 | |||
109 | bool bch_ptr_bad(struct btree *b, const struct bkey *k) | ||
110 | { | ||
111 | struct bucket *g; | ||
112 | unsigned i, stale; | ||
113 | |||
114 | if (!bkey_cmp(k, &ZERO_KEY) || | ||
115 | !KEY_PTRS(k) || | ||
116 | bch_ptr_invalid(b, k)) | ||
117 | return true; | ||
118 | |||
119 | if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) | ||
120 | return true; | ||
121 | |||
122 | for (i = 0; i < KEY_PTRS(k); i++) | ||
123 | if (ptr_available(b->c, k, i)) { | ||
124 | g = PTR_BUCKET(b->c, k, i); | ||
125 | stale = ptr_stale(b->c, k, i); | ||
126 | |||
127 | btree_bug_on(stale > 96, b, | ||
128 | "key too stale: %i, need_gc %u", | ||
129 | stale, b->c->need_gc); | ||
130 | |||
131 | btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), | ||
132 | b, "stale dirty pointer"); | ||
133 | |||
134 | if (stale) | ||
135 | return true; | ||
136 | |||
137 | #ifdef CONFIG_BCACHE_EDEBUG | ||
138 | if (!mutex_trylock(&b->c->bucket_lock)) | ||
139 | continue; | ||
140 | |||
141 | if (b->level) { | ||
142 | if (KEY_DIRTY(k) || | ||
143 | g->prio != BTREE_PRIO || | ||
144 | (b->c->gc_mark_valid && | ||
145 | GC_MARK(g) != GC_MARK_METADATA)) | ||
146 | goto bug; | ||
147 | |||
148 | } else { | ||
149 | if (g->prio == BTREE_PRIO) | ||
150 | goto bug; | ||
151 | |||
152 | if (KEY_DIRTY(k) && | ||
153 | b->c->gc_mark_valid && | ||
154 | GC_MARK(g) != GC_MARK_DIRTY) | ||
155 | goto bug; | ||
156 | } | ||
157 | mutex_unlock(&b->c->bucket_lock); | ||
158 | #endif | ||
159 | } | ||
160 | |||
161 | return false; | ||
162 | #ifdef CONFIG_BCACHE_EDEBUG | ||
163 | bug: | ||
164 | mutex_unlock(&b->c->bucket_lock); | ||
165 | btree_bug(b, | ||
166 | "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||
167 | pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), | ||
168 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||
169 | return true; | ||
170 | #endif | ||
171 | } | ||
172 | |||
/* Key/pointer manipulation */

/*
 * Copy src to dest keeping only pointer i: dest ends up with exactly
 * one pointer and no checksum.
 *
 * NOTE(review): valid pointer indices are 0..KEY_PTRS(src)-1, so this
 * bound arguably should be i >= KEY_PTRS(src); confirm no caller
 * passes i == KEY_PTRS(src).
 */
void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
			      unsigned i)
{
	BUG_ON(i > KEY_PTRS(src));

	/* Only copy the header, key, and one pointer. */
	memcpy(dest, src, 2 * sizeof(uint64_t));
	dest->ptr[0] = src->ptr[i];
	SET_KEY_PTRS(dest, 1);
	/* We didn't copy the checksum so clear that bit. */
	SET_KEY_CSUM(dest, 0);
}
187 | |||
/*
 * Trim the front of key k so it starts at 'where'.  Returns false if
 * 'where' is at or before k's start (nothing to trim).  If 'where' is
 * past k's end, k becomes a zero-size key ending at 'where'.
 *
 * Each pointer is advanced by the number of sectors cut off
 * (old size minus new size), so the pointers still address the start
 * of the remaining data.
 */
bool __bch_cut_front(const struct bkey *where, struct bkey *k)
{
	unsigned i, len = 0;

	if (bkey_cmp(where, &START_KEY(k)) <= 0)
		return false;

	/* len = sectors remaining after the cut */
	if (bkey_cmp(where, k) < 0)
		len = KEY_OFFSET(k) - KEY_OFFSET(where);
	else
		bkey_copy_key(k, where);

	for (i = 0; i < KEY_PTRS(k); i++)
		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);

	BUG_ON(len > KEY_SIZE(k));
	SET_KEY_SIZE(k, len);
	return true;
}
207 | |||
/*
 * Trim the back of key k so it ends at 'where'.  Returns false if k
 * already ends at or before 'where'.  If 'where' is before k's start,
 * k becomes a zero-size key at 'where'.
 *
 * Unlike __bch_cut_front() the pointers need no adjustment: the
 * remaining data still begins where it did.  Caller must stay within
 * one inode.
 */
bool __bch_cut_back(const struct bkey *where, struct bkey *k)
{
	unsigned len = 0;

	if (bkey_cmp(where, k) >= 0)
		return false;

	BUG_ON(KEY_INODE(where) != KEY_INODE(k));

	/* len = sectors remaining in front of the cut */
	if (bkey_cmp(where, &START_KEY(k)) > 0)
		len = KEY_OFFSET(where) - KEY_START(k);

	bkey_copy_key(k, where);

	BUG_ON(len > KEY_SIZE(k));
	SET_KEY_SIZE(k, len);
	return true;
}
226 | |||
227 | static uint64_t merge_chksums(struct bkey *l, struct bkey *r) | ||
228 | { | ||
229 | return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & | ||
230 | ~((uint64_t)1 << 63); | ||
231 | } | ||
232 | |||
/* Tries to merge l and r: l should be lower than r
 * Returns true if we were able to merge. If we did merge, l will be the merged
 * key, r will be untouched.
 */
bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
{
	unsigned i;

	if (key_merging_disabled(b->c))
		return false;

	/* Same pointer count, same dirtiness, and l must end exactly
	 * where r starts */
	if (KEY_PTRS(l) != KEY_PTRS(r) ||
	    KEY_DIRTY(l) != KEY_DIRTY(r) ||
	    bkey_cmp(l, &START_KEY(r)))
		return false;

	/* Each pointer pair must be contiguous on disk and stay within
	 * one bucket */
	for (i = 0; i < KEY_PTRS(l); i++)
		if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
		    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
			return false;

	/* Keys with no pointers aren't restricted to one bucket and could
	 * overflow KEY_SIZE
	 */
	if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
		/* Grow l to max size and cut the same amount off r's front;
		 * still return false since r survives (shrunk) */
		SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
		SET_KEY_SIZE(l, USHRT_MAX);

		bch_cut_front(l, r);
		return false;
	}

	/* Checksums can only be merged if both keys carry one */
	if (KEY_CSUM(l)) {
		if (KEY_CSUM(r))
			l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
		else
			SET_KEY_CSUM(l, 0);
	}

	SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
	SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));

	return true;
}
277 | |||
/* Binary tree stuff for auxiliary search trees */

/*
 * Successor of node j in an inorder traversal of a complete binary
 * tree laid out in an array with the root at index 1.
 */
static unsigned inorder_next(unsigned j, unsigned size)
{
	if (j * 2 + 1 >= size)
		/* no right child: climb while we're a right child */
		return j >> (ffz(j) + 1);

	/* step to the right child, then descend left as far as possible */
	j = j * 2 + 1;
	while (j * 2 < size)
		j *= 2;

	return j;
}
292 | |||
/*
 * Predecessor of node j in an inorder traversal of a complete binary
 * tree laid out in an array with the root at index 1.
 */
static unsigned inorder_prev(unsigned j, unsigned size)
{
	if (j * 2 >= size)
		/* no left child: climb while we're a left child */
		return j >> ffs(j);

	/* step to the left child, then descend right as far as possible */
	for (j *= 2; j * 2 + 1 < size; j = j * 2 + 1)
		;

	return j;
}
305 | |||
/* I have no idea why this code works... and I'm the one who wrote it
 *
 * However, I do know what it does:
 * Given a binary tree constructed in an array (i.e. how you normally implement
 * a heap), it converts a node in the tree - referenced by array index - to the
 * index it would have if you did an inorder traversal.
 *
 * Also tested for every j, size up to size somewhere around 6 million.
 *
 * The binary tree starts at array index 1, not 0
 * extra is a function of size:
 *   extra = (size - rounddown_pow_of_two(size - 1)) << 1;
 */
static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
{
	unsigned b = fls(j);
	unsigned shift = fls(size - 1) - b;

	/* clear j's top bit, then shift the rest up under a new low 1 bit */
	j ^= 1U << (b - 1);
	j <<= 1;
	j |= 1;
	j <<= shift;

	/* compensate for the tree's bottom level being only partially full */
	if (j > extra)
		j -= (j - extra) >> 1;

	return j;
}
334 | |||
335 | static unsigned to_inorder(unsigned j, struct bset_tree *t) | ||
336 | { | ||
337 | return __to_inorder(j, t->size, t->extra); | ||
338 | } | ||
339 | |||
/* Inverse of __to_inorder(): inorder index back to array/tree index */
static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
{
	unsigned s;

	/* undo the partial-bottom-level correction __to_inorder() applied */
	if (j > extra)
		j += j - extra;

	s = ffs(j);

	return (j >> s) | (roundup_pow_of_two(size) >> s);
}
354 | |||
355 | static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) | ||
356 | { | ||
357 | return __inorder_to_tree(j, t->size, t->extra); | ||
358 | } | ||
359 | |||
#if 0
/*
 * Kept (compiled out) as an exhaustive self-test of the inorder
 * conversion helpers above: for each size it walks the whole tree,
 * checking __to_inorder()/__inorder_to_tree() round-trip and that
 * inorder_prev() inverts inorder_next().
 */
void inorder_test(void)
{
	unsigned long done = 0;
	ktime_t start = ktime_get();

	for (unsigned size = 2;
	     size < 65536000;
	     size++) {
		unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
		unsigned i = 1, j = rounddown_pow_of_two(size - 1);

		if (!(size % 4096))
			printk(KERN_NOTICE "loop %u, %llu per us\n", size,
			       done / ktime_us_delta(ktime_get(), start));

		while (1) {
			if (__inorder_to_tree(i, size, extra) != j)
				panic("size %10u j %10u i %10u", size, j, i);

			if (__to_inorder(j, size, extra) != i)
				panic("size %10u j %10u i %10u", size, j, i);

			if (j == rounddown_pow_of_two(size) - 1)
				break;

			BUG_ON(inorder_prev(inorder_next(j, size), size) != j);

			j = inorder_next(j, size);
			i++;
		}

		done += size - 1;
	}
}
#endif
396 | |||
397 | /* | ||
398 | * Cacheline/offset <-> bkey pointer arithmatic: | ||
399 | * | ||
400 | * t->tree is a binary search tree in an array; each node corresponds to a key | ||
401 | * in one cacheline in t->set (BSET_CACHELINE bytes). | ||
402 | * | ||
403 | * This means we don't have to store the full index of the key that a node in | ||
404 | * the binary tree points to; to_inorder() gives us the cacheline, and then | ||
405 | * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes. | ||
406 | * | ||
407 | * cacheline_to_bkey() and friends abstract out all the pointer arithmatic to | ||
408 | * make this work. | ||
409 | * | ||
410 | * To construct the bfloat for an arbitrary key we need to know what the key | ||
411 | * immediately preceding it is: we have to check if the two keys differ in the | ||
412 | * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size | ||
413 | * of the previous key so we can walk backwards to it from t->tree[j]'s key. | ||
414 | */ | ||
415 | |||
416 | static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline, | ||
417 | unsigned offset) | ||
418 | { | ||
419 | return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8; | ||
420 | } | ||
421 | |||
422 | static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) | ||
423 | { | ||
424 | return ((void *) k - (void *) t->data) / BSET_CACHELINE; | ||
425 | } | ||
426 | |||
427 | static unsigned bkey_to_cacheline_offset(struct bkey *k) | ||
428 | { | ||
429 | return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); | ||
430 | } | ||
431 | |||
432 | static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) | ||
433 | { | ||
434 | return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m); | ||
435 | } | ||
436 | |||
437 | static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j) | ||
438 | { | ||
439 | return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]); | ||
440 | } | ||
441 | |||
442 | /* | ||
443 | * For the write set - the one we're currently inserting keys into - we don't | ||
444 | * maintain a full search tree, we just keep a simple lookup table in t->prev. | ||
445 | */ | ||
446 | static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) | ||
447 | { | ||
448 | return cacheline_to_bkey(t, cacheline, t->prev[cacheline]); | ||
449 | } | ||
450 | |||
/*
 * 128 bit right shift: returns the low 64 bits of ((high << 64)|low) >> shift.
 *
 * Used by bfloat_mantissa() to extract mantissa bits that may straddle a
 * 64 bit word boundary. Only meaningful for 0 <= shift <= 63.
 */
static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
{
#ifdef CONFIG_X86_64
	/* x86-64's double precision shift instruction does this directly:
	 * shrd shifts bits from %[high] into the top of %[low] */
	asm("shrd %[shift],%[high],%[low]"
	    : [low] "+Rm" (low)
	    : [high] "R" (high),
	      [shift] "ci" (shift)
	    : "cc");
#else
	low >>= shift;
	/* Two-step shift ((high << 1) << (63 - shift)) keeps each shift
	 * count below 64, avoiding undefined behaviour when shift == 0 */
	low |= (high << 1) << (63U - shift);
#endif
	return low;
}
465 | |||
/*
 * Extract the BKEY_MANTISSA_BITS bits of @k starting at bit f->exponent,
 * counting up from the low bit of k->low.
 *
 * f->exponent >> 6 selects which 64 bit word of the key the mantissa starts
 * in - the key's words are contiguous in memory, so we index backwards from
 * &k->low. The low 6 bits of the exponent give the bit offset within that
 * word; shrd128() handles mantissas straddling a word boundary (p[-1] is the
 * next higher word of the key).
 */
static inline unsigned bfloat_mantissa(const struct bkey *k,
				       struct bkey_float *f)
{
	const uint64_t *p = &k->low - (f->exponent >> 6);
	return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
}
472 | |||
/*
 * Construct the bkey_float for node @j of @t's auxiliary search tree: pick
 * the bit offset (exponent) at which the keys bracketing this node's subtree
 * first differ, and store this node's key's bits at that offset (mantissa).
 */
static void make_bfloat(struct bset_tree *t, unsigned j)
{
	struct bkey_float *f = &t->tree[j];
	struct bkey *m = tree_to_bkey(t, j);
	struct bkey *p = tree_to_prev_bkey(t, j);

	/* l and r bracket the subtree rooted at j: nodes on the left/right
	 * spine are bounded by the bset's first/last key, other nodes by the
	 * appropriate ancestor's key */
	struct bkey *l = is_power_of_2(j)
		? t->data->start
		: tree_to_prev_bkey(t, j >> ffs(j));

	struct bkey *r = is_power_of_2(j + 1)
		? node(t->data, t->data->keys - bkey_u64s(&t->end))
		: tree_to_bkey(t, j >> (ffz(j) + 1));

	BUG_ON(m < l || m > r);
	BUG_ON(bkey_next(p) != m);

	/* Highest bit at which l and r differ; inode bits sit above the
	 * 64 offset bits, hence the + 64 */
	if (KEY_INODE(l) != KEY_INODE(r))
		f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
	else
		f->exponent = fls64(r->low ^ l->low);

	f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);

	/*
	 * Setting f->exponent = 127 flags this node as failed, and causes the
	 * lookup code to fall back to comparing against the original key.
	 */

	/* The mantissa must distinguish m from its predecessor p, else the
	 * bfloat is useless; the - 1 is for the sign bit trick in
	 * bset_search_tree() */
	if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
		f->mantissa = bfloat_mantissa(m, f) - 1;
	else
		f->exponent = 127;
}
507 | |||
508 | static void bset_alloc_tree(struct btree *b, struct bset_tree *t) | ||
509 | { | ||
510 | if (t != b->sets) { | ||
511 | unsigned j = roundup(t[-1].size, | ||
512 | 64 / sizeof(struct bkey_float)); | ||
513 | |||
514 | t->tree = t[-1].tree + j; | ||
515 | t->prev = t[-1].prev + j; | ||
516 | } | ||
517 | |||
518 | while (t < b->sets + MAX_BSETS) | ||
519 | t++->size = 0; | ||
520 | } | ||
521 | |||
522 | static void bset_build_unwritten_tree(struct btree *b) | ||
523 | { | ||
524 | struct bset_tree *t = b->sets + b->nsets; | ||
525 | |||
526 | bset_alloc_tree(b, t); | ||
527 | |||
528 | if (t->tree != b->sets->tree + bset_tree_space(b)) { | ||
529 | t->prev[0] = bkey_to_cacheline_offset(t->data->start); | ||
530 | t->size = 1; | ||
531 | } | ||
532 | } | ||
533 | |||
/*
 * Build the full auxiliary search tree for b's last bset. Only done for
 * written (immutable) bsets - the tree is never updated for insertions.
 */
static void bset_build_written_tree(struct btree *b)
{
	struct bset_tree *t = b->sets + b->nsets;
	struct bkey *k = t->data->start;
	unsigned j, cacheline = 1;

	bset_alloc_tree(b, t);

	/* One tree node per cacheline of keys, capped by the space left in
	 * the shared tree/prev arrays */
	t->size = min_t(unsigned,
			bkey_to_cacheline(t, end(t->data)),
			b->sets->tree + bset_tree_space(b) - t->tree);

	if (t->size < 2) {
		t->size = 0;
		return;
	}

	/* Precomputed constant for to_inorder()'s index mapping */
	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;

	/* First we figure out where the first key in each cacheline is */
	for (j = inorder_next(0, t->size);
	     j;
	     j = inorder_next(j, t->size)) {
		while (bkey_to_cacheline(t, k) != cacheline)
			k = bkey_next(k);

		/* t->prev[j] is the size of the key preceding this
		 * cacheline's first key, so tree_to_prev_bkey() can walk
		 * backwards to it */
		t->prev[j] = bkey_u64s(k);
		k = bkey_next(k);
		cacheline++;
		t->tree[j].m = bkey_to_cacheline_offset(k);
	}

	/* Cache a copy of the bset's last key - used as the right bound in
	 * make_bfloat() and for the fast path in __bch_bset_search() */
	while (bkey_next(k) != end(t->data))
		k = bkey_next(k);

	t->end = *k;

	/* Then we build the tree */
	for (j = inorder_next(0, t->size);
	     j;
	     j = inorder_next(j, t->size))
		make_bfloat(t, j);
}
577 | |||
/*
 * @k was modified in place (e.g. its pointers updated by garbage collection);
 * rebuild any bfloats in the auxiliary search tree that were derived from it
 * so lookups stay correct.
 */
void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k)
{
	struct bset_tree *t;
	unsigned inorder, j = 1;

	/* Find the bset k lives in */
	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
		if (k < end(t->data))
			goto found_set;

	BUG();
found_set:
	/* Unwritten sets have no search tree - nothing to fix */
	if (!t->size || !bset_written(b, t))
		return;

	inorder = bkey_to_cacheline(t, k);

	/* First key of the bset: it's the left bound of the whole left
	 * spine, so redo that spine starting from the root (j == 1) */
	if (k == t->data->start)
		goto fix_left;

	/* Last key: it's cached in t->end and bounds the right spine */
	if (bkey_next(k) == end(t->data)) {
		t->end = *k;
		goto fix_right;
	}

	j = inorder_to_tree(inorder, t);

	/* k is the key some tree node points at: redo that node, then the
	 * chain of left descendants it bounds on the right */
	if (j &&
	    j < t->size &&
	    k == tree_to_bkey(t, j))
fix_left:	do {
			make_bfloat(t, j);
			j = j * 2;
		} while (j < t->size);

	j = inorder_to_tree(inorder + 1, t);

	/* k is the predecessor of some tree node's key: redo that node, then
	 * the chain of right descendants it bounds on the left */
	if (j &&
	    j < t->size &&
	    k == tree_to_prev_bkey(t, j))
fix_right:	do {
			make_bfloat(t, j);
			j = j * 2 + 1;
		} while (j < t->size);
}
622 | |||
/*
 * Fix up the write set's lookup table after inserting @k: every key after
 * the insertion point moved up by bkey_u64s(k) 64 bit words, so the stored
 * cacheline offsets in t->prev must shift too.
 */
void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
{
	struct bset_tree *t = &b->sets[b->nsets];
	unsigned shift = bkey_u64s(k);
	unsigned j = bkey_to_cacheline(t, k);

	/* We're getting called from btree_split() or btree_gc, just bail out */
	if (!t->size)
		return;

	/* k is the key we just inserted; we need to find the entry in the
	 * lookup table for the first key that is strictly greater than k:
	 * it's either k's cacheline or the next one
	 */
	if (j < t->size &&
	    table_to_bkey(t, j) <= k)
		j++;

	/* Adjust all the lookup table entries, and find a new key for any that
	 * have gotten too big
	 */
	for (; j < t->size; j++) {
		t->prev[j] += shift;

		/* Entry grew past 7: re-anchor it to the first key starting
		 * at or after the beginning of cacheline j */
		if (t->prev[j] > 7) {
			k = table_to_bkey(t, j - 1);

			while (k < cacheline_to_bkey(t, j, 0))
				k = bkey_next(k);

			t->prev[j] = bkey_to_cacheline_offset(k);
		}
	}

	/* Shared tree/prev arrays are full - can't grow the table */
	if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
		return;

	/* Possibly add a new entry to the end of the lookup table */

	for (k = table_to_bkey(t, t->size - 1);
	     k != end(t->data);
	     k = bkey_next(k))
		if (t->size == bkey_to_cacheline(t, k)) {
			t->prev[t->size] = bkey_to_cacheline_offset(k);
			t->size++;
		}
}
670 | |||
671 | void bch_bset_init_next(struct btree *b) | ||
672 | { | ||
673 | struct bset *i = write_block(b); | ||
674 | |||
675 | if (i != b->sets[0].data) { | ||
676 | b->sets[++b->nsets].data = i; | ||
677 | i->seq = b->sets[0].data->seq; | ||
678 | } else | ||
679 | get_random_bytes(&i->seq, sizeof(uint64_t)); | ||
680 | |||
681 | i->magic = bset_magic(b->c); | ||
682 | i->version = 0; | ||
683 | i->keys = 0; | ||
684 | |||
685 | bset_build_unwritten_tree(b); | ||
686 | } | ||
687 | |||
/* Result of a cacheline search: the answer lies in [l, r) */
struct bset_search_iter {
	struct bkey *l, *r;
};
691 | |||
692 | static struct bset_search_iter bset_search_write_set(struct btree *b, | ||
693 | struct bset_tree *t, | ||
694 | const struct bkey *search) | ||
695 | { | ||
696 | unsigned li = 0, ri = t->size; | ||
697 | |||
698 | BUG_ON(!b->nsets && | ||
699 | t->size < bkey_to_cacheline(t, end(t->data))); | ||
700 | |||
701 | while (li + 1 != ri) { | ||
702 | unsigned m = (li + ri) >> 1; | ||
703 | |||
704 | if (bkey_cmp(table_to_bkey(t, m), search) > 0) | ||
705 | ri = m; | ||
706 | else | ||
707 | li = m; | ||
708 | } | ||
709 | |||
710 | return (struct bset_search_iter) { | ||
711 | table_to_bkey(t, li), | ||
712 | ri < t->size ? table_to_bkey(t, ri) : end(t->data) | ||
713 | }; | ||
714 | } | ||
715 | |||
/*
 * Search the full auxiliary tree for @search, returning keys l, r that
 * bracket the target cacheline (the caller finishes with a linear search).
 * Mostly compares small cached mantissas instead of full keys, and the
 * breadth-first tree layout keeps the accesses cache friendly.
 */
static struct bset_search_iter bset_search_tree(struct btree *b,
						struct bset_tree *t,
						const struct bkey *search)
{
	struct bkey *l, *r;
	struct bkey_float *f;
	unsigned inorder, j, n = 1;

	do {
		/* Prefetch the descendant 4 levels down; the mask is a
		 * branchless (p < t->size ? p : 0) */
		unsigned p = n << 4;
		p &= ((int) (p - t->size)) >> 31;

		prefetch(&t->tree[p]);

		j = n;
		f = &t->tree[j];

		/*
		 * n = (f->mantissa > bfloat_mantissa())
		 *	? j * 2
		 *	: j * 2 + 1;
		 *
		 * We need to subtract 1 from f->mantissa for the sign bit trick
		 * to work - that's done in make_bfloat()
		 */
		if (likely(f->exponent != 127))
			n = j * 2 + (((unsigned)
				      (f->mantissa -
				       bfloat_mantissa(search, f))) >> 31);
		else
			/* Failed bfloat: fall back to a full key compare */
			n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
				? j * 2
				: j * 2 + 1;
	} while (n < t->size);

	inorder = to_inorder(j, t);

	/*
	 * n would have been the node we recursed to - the low bit tells us if
	 * we recursed left or recursed right.
	 */
	if (n & 1) {
		/* Recursed right: j's key is the left bound; the right bound
		 * is the next cacheline's key (or the end of the bset) */
		l = cacheline_to_bkey(t, inorder, f->m);

		if (++inorder != t->size) {
			f = &t->tree[inorder_next(j, t->size)];
			r = cacheline_to_bkey(t, inorder, f->m);
		} else
			r = end(t->data);
	} else {
		/* Recursed left: j's key is the right bound; the left bound
		 * is the previous cacheline's key (or the bset's start) */
		r = cacheline_to_bkey(t, inorder, f->m);

		if (--inorder) {
			f = &t->tree[inorder_prev(j, t->size)];
			l = cacheline_to_bkey(t, inorder, f->m);
		} else
			l = t->data->start;
	}

	return (struct bset_search_iter) {l, r};
}
777 | |||
/*
 * Return the first key in @t that sorts strictly after @search, or
 * end(t->data) if there is none.
 */
struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
			       const struct bkey *search)
{
	struct bset_search_iter i;

	/*
	 * First, we search for a cacheline, then lastly we do a linear search
	 * within that cacheline.
	 *
	 * To search for the cacheline, there's three different possibilities:
	 * * The set is too small to have a search tree, so we just do a linear
	 *   search over the whole set.
	 * * The set is the one we're currently inserting into; keeping a full
	 *   auxiliary search tree up to date would be too expensive, so we
	 *   use a much simpler lookup table to do a binary search -
	 *   bset_search_write_set().
	 * * Or we use the auxiliary search tree we constructed earlier -
	 *   bset_search_tree()
	 */

	if (unlikely(!t->size)) {
		i.l = t->data->start;
		i.r = end(t->data);
	} else if (bset_written(b, t)) {
		/*
		 * Each node in the auxiliary search tree covers a certain range
		 * of bits, and keys above and below the set it covers might
		 * differ outside those bits - so we have to special case the
		 * start and end - handle that here:
		 */

		if (unlikely(bkey_cmp(search, &t->end) >= 0))
			return end(t->data);

		if (unlikely(bkey_cmp(search, t->data->start) < 0))
			return t->data->start;

		i = bset_search_tree(b, t, search);
	} else
		i = bset_search_write_set(b, t, search);

#ifdef CONFIG_BCACHE_EDEBUG
	/* Sanity check the bracketing interval returned above */
	BUG_ON(bset_written(b, t) &&
	       i.l != t->data->start &&
	       bkey_cmp(tree_to_prev_bkey(t,
		  inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
			search) > 0);

	BUG_ON(i.r != end(t->data) &&
	       bkey_cmp(i.r, search) <= 0);
#endif

	/* Final linear search within the cacheline */
	while (likely(i.l != i.r) &&
	       bkey_cmp(i.l, search) <= 0)
		i.l = bkey_next(i.l);

	return i.l;
}
836 | |||
837 | /* Btree iterator */ | ||
838 | |||
839 | static inline bool btree_iter_cmp(struct btree_iter_set l, | ||
840 | struct btree_iter_set r) | ||
841 | { | ||
842 | int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); | ||
843 | |||
844 | return c ? c > 0 : l.k < r.k; | ||
845 | } | ||
846 | |||
847 | static inline bool btree_iter_end(struct btree_iter *iter) | ||
848 | { | ||
849 | return !iter->used; | ||
850 | } | ||
851 | |||
852 | void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, | ||
853 | struct bkey *end) | ||
854 | { | ||
855 | if (k != end) | ||
856 | BUG_ON(!heap_add(iter, | ||
857 | ((struct btree_iter_set) { k, end }), | ||
858 | btree_iter_cmp)); | ||
859 | } | ||
860 | |||
861 | struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, | ||
862 | struct bkey *search, struct bset_tree *start) | ||
863 | { | ||
864 | struct bkey *ret = NULL; | ||
865 | iter->size = ARRAY_SIZE(iter->data); | ||
866 | iter->used = 0; | ||
867 | |||
868 | for (; start <= &b->sets[b->nsets]; start++) { | ||
869 | ret = bch_bset_search(b, start, search); | ||
870 | bch_btree_iter_push(iter, ret, end(start->data)); | ||
871 | } | ||
872 | |||
873 | return ret; | ||
874 | } | ||
875 | |||
/*
 * Return the smallest key across all bsets the iterator covers and advance
 * past it, or NULL when the iterator is exhausted.
 */
struct bkey *bch_btree_iter_next(struct btree_iter *iter)
{
	struct btree_iter_set unused;
	struct bkey *ret = NULL;

	if (!btree_iter_end(iter)) {
		/* Top of the heap holds the globally smallest key */
		ret = iter->data->k;
		iter->data->k = bkey_next(iter->data->k);

		/* Advancing past the end means a key's size field lied -
		 * clamp and warn rather than walking off the bset */
		if (iter->data->k > iter->data->end) {
			WARN_ONCE(1, "bset was corrupt!\n");
			iter->data->k = iter->data->end;
		}

		/* Drop an exhausted bset, otherwise restore heap order */
		if (iter->data->k == iter->data->end)
			heap_pop(iter, unused, btree_iter_cmp);
		else
			heap_sift(iter, 0, btree_iter_cmp);
	}

	return ret;
}
898 | |||
899 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, | ||
900 | struct btree *b, ptr_filter_fn fn) | ||
901 | { | ||
902 | struct bkey *ret; | ||
903 | |||
904 | do { | ||
905 | ret = bch_btree_iter_next(iter); | ||
906 | } while (ret && fn(b, ret)); | ||
907 | |||
908 | return ret; | ||
909 | } | ||
910 | |||
911 | struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) | ||
912 | { | ||
913 | struct btree_iter iter; | ||
914 | |||
915 | bch_btree_iter_init(b, &iter, search); | ||
916 | return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | ||
917 | } | ||
918 | |||
919 | /* Mergesort */ | ||
920 | |||
/*
 * Called during mergesort before the heap's top key is emitted: trims keys
 * from the runner-up bset where they overlap with the top key, so the sorted
 * output contains no overlapping extents.
 */
static void btree_sort_fixup(struct btree_iter *iter)
{
	while (iter->used > 1) {
		struct btree_iter_set *top = iter->data, *i = top + 1;
		struct bkey *k;

		/* Of the heap top's two children, pick whichever sorts
		 * first - that's the next contender */
		if (iter->used > 2 &&
		    btree_iter_cmp(i[0], i[1]))
			i++;

		/* Walk i's keys while they start before top->k ends, cutting
		 * the overlap out of one side or the other (which side is
		 * chosen by key address - NOTE(review): presumably the
		 * higher address is the newer key; confirm against the
		 * insert path) */
		for (k = i->k;
		     k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
		     k = bkey_next(k))
			if (top->k > i->k)
				__bch_cut_front(top->k, k);
			else if (KEY_SIZE(k))
				bch_cut_back(&START_KEY(k), top->k);

		if (top->k < i->k || k == i->k)
			break;

		/* i's front key changed - restore heap order and recheck */
		heap_sift(iter, i - top, btree_iter_cmp);
	}
}
945 | |||
/*
 * Merge all the bsets fed by @iter into @out in sorted order, dropping bad
 * keys and merging adjacent extents on leaf nodes where possible.
 *
 * @fixup:        fix overlapping extents while merging (leaf nodes only)
 * @remove_stale: drop stale keys too (bch_ptr_bad), not just invalid ones
 *                (bch_ptr_invalid)
 */
static void btree_mergesort(struct btree *b, struct bset *out,
			    struct btree_iter *iter,
			    bool fixup, bool remove_stale)
{
	struct bkey *k, *last = NULL;
	bool (*bad)(struct btree *, const struct bkey *) = remove_stale
		? bch_ptr_bad
		: bch_ptr_invalid;

	while (!btree_iter_end(iter)) {
		if (fixup && !b->level)
			btree_sort_fixup(iter);

		k = bch_btree_iter_next(iter);
		if (bad(b, k))
			continue;

		if (!last) {
			last = out->start;
			bkey_copy(last, k);
		} else if (b->level ||
			   !bch_bkey_try_merge(b, last, k)) {
			/* Couldn't merge k into the previous key - append */
			last = bkey_next(last);
			bkey_copy(last, k);
		}
	}

	/* Output size in u64s: from the start of the bset's key area to the
	 * end of the last key written */
	out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;

	pr_debug("sorted %i keys", out->keys);
	bch_check_key_order(b, out);
}
978 | |||
/*
 * Sort bsets @start..nsets of @b into one bset via mergesort, using a freshly
 * allocated temporary buffer, or the cache set's preallocated sort buffer
 * (under sort_lock) if the allocation fails.
 */
static void __btree_sort(struct btree *b, struct btree_iter *iter,
			 unsigned start, unsigned order, bool fixup)
{
	uint64_t start_time;
	/* Stale keys may only be dropped before the node is first written -
	 * after that, gc still needs to see them */
	bool remove_stale = !b->written;
	struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
						     order);
	if (!out) {
		/* Fall back to the shared buffer, held until we're done */
		mutex_lock(&b->c->sort_lock);
		out = b->c->sort;
		order = ilog2(bucket_pages(b->c));
	}

	start_time = local_clock();

	btree_mergesort(b, out, iter, fixup, remove_stale);
	b->nsets = start;

	if (!fixup && !start && b->written)
		bch_btree_verify(b, out);

	if (!start && order == b->page_order) {
		/*
		 * Our temporary buffer is the same size as the btree node's
		 * buffer, we can just swap buffers instead of doing a big
		 * memcpy()
		 */

		out->magic = bset_magic(b->c);
		out->seq = b->sets[0].data->seq;
		out->version = b->sets[0].data->version;
		swap(out, b->sets[0].data);

		/* If the node just adopted the shared sort buffer, hand the
		 * node's old buffer back to the cache set in its place */
		if (b->c->sort == b->sets[0].data)
			b->c->sort = out;
	} else {
		/* Sizes differ - copy the sorted keys back into the node */
		b->sets[start].data->keys = out->keys;
		memcpy(b->sets[start].data->start, out->start,
		       (void *) end(out) - (void *) out->start);
	}

	if (out == b->c->sort)
		mutex_unlock(&b->c->sort_lock);
	else
		free_pages((unsigned long) out, order);

	if (b->written)
		bset_build_written_tree(b);

	/* Only full sorts count toward the timing stats */
	if (!start) {
		spin_lock(&b->c->sort_time_lock);
		bch_time_stats_update(&b->c->sort_time, start_time);
		spin_unlock(&b->c->sort_time_lock);
	}
}
1034 | |||
/*
 * Sort and merge the bsets of @b from index @start onwards into one bset;
 * start == 0 resorts the whole node.
 */
void bch_btree_sort_partial(struct btree *b, unsigned start)
{
	size_t oldsize = 0, order = b->page_order, keys = 0;
	struct btree_iter iter;
	__bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);

	BUG_ON(b->sets[b->nsets].data == write_block(b) &&
	       (b->sets[b->nsets].size || b->nsets));

	if (b->written)
		oldsize = bch_count_data(b);

	if (start) {
		/* Partial sort: size the temporary buffer for just the keys
		 * being merged instead of the whole node */
		unsigned i;

		for (i = start; i <= b->nsets; i++)
			keys += b->sets[i].data->keys;

		order = roundup_pow_of_two(__set_bytes(b->sets->data,
						       keys)) / PAGE_SIZE;
		if (order)
			order = ilog2(order);
	}

	__btree_sort(b, &iter, start, order, false);

	/* Sorting must not change the amount of live data */
	EBUG_ON(b->written && bch_count_data(b) != oldsize);
}
1063 | |||
/*
 * Resort the whole node with extent-overlap fixup enabled (see
 * btree_sort_fixup()); only valid on nodes that have already been written.
 */
void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
{
	BUG_ON(!b->written);
	__btree_sort(b, iter, 0, b->page_order, true);
}
1069 | |||
1070 | void bch_btree_sort_into(struct btree *b, struct btree *new) | ||
1071 | { | ||
1072 | uint64_t start_time = local_clock(); | ||
1073 | |||
1074 | struct btree_iter iter; | ||
1075 | bch_btree_iter_init(b, &iter, NULL); | ||
1076 | |||
1077 | btree_mergesort(b, new->sets->data, &iter, false, true); | ||
1078 | |||
1079 | spin_lock(&b->c->sort_time_lock); | ||
1080 | bch_time_stats_update(&b->c->sort_time, start_time); | ||
1081 | spin_unlock(&b->c->sort_time_lock); | ||
1082 | |||
1083 | bkey_copy_key(&new->key, &b->key); | ||
1084 | new->sets->size = 0; | ||
1085 | } | ||
1086 | |||
/*
 * Heuristically decide whether resorting @b is worth it now: partially sort
 * from set j onwards once those sets hold less than half the node's keys (or
 * fewer than 1000) so the merge stays cheap, and force a full sort when
 * we're about to run out of bset slots.
 */
void bch_btree_sort_lazy(struct btree *b)
{
	if (b->nsets) {
		unsigned i, j, keys = 0, total;

		for (i = 0; i <= b->nsets; i++)
			keys += b->sets[i].data->keys;

		total = keys;

		/* At each iteration, keys == number of keys in sets
		 * j..nsets */
		for (j = 0; j < b->nsets; j++) {
			if (keys * 2 < total ||
			    keys < 1000) {
				bch_btree_sort_partial(b, j);
				return;
			}

			keys -= b->sets[j].data->keys;
		}

		/* Must sort if b->nsets == 3 or we'll overflow */
		if (b->nsets >= (MAX_BSETS - 1) - b->level) {
			bch_btree_sort(b);
			return;
		}
	}

	/* Didn't sort: just (re)build the search tree for the last set */
	bset_build_written_tree(b);
}
1116 | |||
1117 | /* Sysfs stuff */ | ||
1118 | |||
/* Totals accumulated by bch_btree_bset_stats() for sysfs reporting */
struct bset_stats {
	size_t nodes;				/* btree nodes visited */
	size_t sets_written, sets_unwritten;	/* bset counts by state */
	size_t bytes_written, bytes_unwritten;	/* key bytes by state */
	size_t floats, failed;			/* bfloats built / failed */
};
1125 | |||
/*
 * Accumulate per-bset search tree statistics for @b into @stats, recursing
 * into children for interior nodes. Returns 0 or an error from the btree
 * walk.
 */
static int bch_btree_bset_stats(struct btree *b, struct btree_op *op,
				struct bset_stats *stats)
{
	struct bkey *k;
	unsigned i;

	stats->nodes++;

	for (i = 0; i <= b->nsets; i++) {
		struct bset_tree *t = &b->sets[i];
		size_t bytes = t->data->keys * sizeof(uint64_t);
		size_t j;

		if (bset_written(b, t)) {
			stats->sets_written++;
			stats->bytes_written += bytes;

			/* Tree node 0 is unused, hence size - 1.
			 * NOTE(review): a written set can have t->size == 0
			 * (see bset_build_written_tree()), in which case
			 * this underflows - confirm whether that case can
			 * reach here */
			stats->floats += t->size - 1;

			/* exponent == 127 marks a failed bfloat, see
			 * make_bfloat() */
			for (j = 1; j < t->size; j++)
				if (t->tree[j].exponent == 127)
					stats->failed++;
		} else {
			stats->sets_unwritten++;
			stats->bytes_unwritten += bytes;
		}
	}

	if (b->level) {
		/* Interior node: recurse into each live child */
		struct btree_iter iter;

		for_each_key_filter(b, k, &iter, bch_ptr_bad) {
			int ret = btree(bset_stats, k, b, op, stats);
			if (ret)
				return ret;
		}
	}

	return 0;
}
1166 | |||
1167 | int bch_bset_print_stats(struct cache_set *c, char *buf) | ||
1168 | { | ||
1169 | struct btree_op op; | ||
1170 | struct bset_stats t; | ||
1171 | int ret; | ||
1172 | |||
1173 | bch_btree_op_init_stack(&op); | ||
1174 | memset(&t, 0, sizeof(struct bset_stats)); | ||
1175 | |||
1176 | ret = btree_root(bset_stats, c, &op, &t); | ||
1177 | if (ret) | ||
1178 | return ret; | ||
1179 | |||
1180 | return snprintf(buf, PAGE_SIZE, | ||
1181 | "btree nodes: %zu\n" | ||
1182 | "written sets: %zu\n" | ||
1183 | "unwritten sets: %zu\n" | ||
1184 | "written key bytes: %zu\n" | ||
1185 | "unwritten key bytes: %zu\n" | ||
1186 | "floats: %zu\n" | ||
1187 | "failed: %zu\n", | ||
1188 | t.nodes, | ||
1189 | t.sets_written, t.sets_unwritten, | ||
1190 | t.bytes_written, t.bytes_unwritten, | ||
1191 | t.floats, t.failed); | ||
1192 | } | ||
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h new file mode 100644 index 000000000000..57a9cff41546 --- /dev/null +++ b/drivers/md/bcache/bset.h | |||
@@ -0,0 +1,379 @@ | |||
1 | #ifndef _BCACHE_BSET_H | ||
2 | #define _BCACHE_BSET_H | ||
3 | |||
4 | /* | ||
5 | * BKEYS: | ||
6 | * | ||
7 | * A bkey contains a key, a size field, a variable number of pointers, and some | ||
8 | * ancillary flag bits. | ||
9 | * | ||
10 | * We use two different functions for validating bkeys, bch_ptr_invalid and | ||
11 | * bch_ptr_bad(). | ||
12 | * | ||
13 | * bch_ptr_invalid() primarily filters out keys and pointers that would be | ||
14 | * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and | ||
15 | * pointer that occur in normal practice but don't point to real data. | ||
16 | * | ||
17 | * The one exception to the rule that ptr_invalid() filters out invalid keys is | ||
18 | * that it also filters out keys of size 0 - these are keys that have been | ||
19 | * completely overwritten. It'd be safe to delete these in memory while leaving | ||
20 | * them on disk, just unnecessary work - so we filter them out when resorting | ||
21 | * instead. | ||
22 | * | ||
23 | * We can't filter out stale keys when we're resorting, because garbage | ||
24 | * collection needs to find them to ensure bucket gens don't wrap around - | ||
25 | * unless we're rewriting the btree node those stale keys still exist on disk. | ||
26 | * | ||
27 | * We also implement functions here for removing some number of sectors from the | ||
28 | * front or the back of a bkey - this is mainly used for fixing overlapping | ||
29 | * extents, by removing the overlapping sectors from the older key. | ||
30 | * | ||
31 | * BSETS: | ||
32 | * | ||
33 | * A bset is an array of bkeys laid out contiguously in memory in sorted order, | ||
34 | * along with a header. A btree node is made up of a number of these, written at | ||
35 | * different times. | ||
36 | * | ||
37 | * There could be many of them on disk, but we never allow there to be more than | ||
38 | * 4 in memory - we lazily resort as needed. | ||
39 | * | ||
40 | * We implement code here for creating and maintaining auxiliary search trees | ||
41 | * (described below) for searching an individual bset, and on top of that we | ||
42 | * implement a btree iterator. | ||
43 | * | ||
44 | * BTREE ITERATOR: | ||
45 | * | ||
46 | * Most of the code in bcache doesn't care about an individual bset - it needs | ||
47 | * to search entire btree nodes and iterate over them in sorted order. | ||
48 | * | ||
49 | * The btree iterator code serves both functions; it iterates through the keys | ||
50 | * in a btree node in sorted order, starting from either keys after a specific | ||
51 | * point (if you pass it a search key) or the start of the btree node. | ||
52 | * | ||
53 | * AUXILIARY SEARCH TREES: | ||
54 | * | ||
55 | * Since keys are variable length, we can't use a binary search on a bset - we | ||
56 | * wouldn't be able to find the start of the next key. But binary searches are | ||
57 | * slow anyways, due to terrible cache behaviour; bcache originally used binary | ||
58 | * searches and that code topped out at under 50k lookups/second. | ||
59 | * | ||
60 | * So we need to construct some sort of lookup table. Since we only insert keys | ||
61 | * into the last (unwritten) set, most of the keys within a given btree node are | ||
62 | * usually in sets that are mostly constant. We use two different types of | ||
63 | * lookup tables to take advantage of this. | ||
64 | * | ||
65 | * Both lookup tables share in common that they don't index every key in the | ||
66 | * set; they index one key every BSET_CACHELINE bytes, and then a linear search | ||
67 | * is used for the rest. | ||
68 | * | ||
69 | * For sets that have been written to disk and are no longer being inserted | ||
70 | * into, we construct a binary search tree in an array - traversing a binary | ||
71 | * search tree in an array gives excellent locality of reference and is very | ||
72 | * fast, since both children of any node are adjacent to each other in memory | ||
73 | * (and their grandchildren, and great grandchildren...) - this means | ||
74 | * prefetching can be used to great effect. | ||
75 | * | ||
76 | * It's quite useful performance wise to keep these nodes small - not just | ||
77 | * because they're more likely to be in L2, but also because we can prefetch | ||
78 | * more nodes on a single cacheline and thus prefetch more iterations in advance | ||
79 | * when traversing this tree. | ||
80 | * | ||
81 | * Nodes in the auxiliary search tree must contain both a key to compare against | ||
82 | * (we don't want to fetch the key from the set, that would defeat the purpose), | ||
83 | * and a pointer to the key. We use a few tricks to compress both of these. | ||
84 | * | ||
85 | * To compress the pointer, we take advantage of the fact that one node in the | ||
86 | * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have | ||
87 | * a function (to_inorder()) that takes the index of a node in a binary tree and | ||
88 | * returns what its index would be in an inorder traversal, so we only have to | ||
89 | * store the low bits of the offset. | ||
90 | * | ||
91 | * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To | ||
92 | * compress that, we take advantage of the fact that when we're traversing the | ||
93 | * search tree at every iteration we know that both our search key and the key | ||
94 | * we're looking for lie within some range - bounded by our previous | ||
95 | * comparisons. (We special case the start of a search so that this is true even | ||
96 | * at the root of the tree). | ||
97 | * | ||
98 | * So we know the key we're looking for is between a and b, and a and b don't | ||
99 | * differ higher than bit 50, we don't need to check anything higher than bit | ||
100 | * 50. | ||
101 | * | ||
102 | * We don't usually need the rest of the bits, either; we only need enough bits | ||
103 | * to partition the key range we're currently checking. Consider key n - the | ||
104 | * key our auxiliary search tree node corresponds to, and key p, the key | ||
105 | * immediately preceding n. The lowest bit we need to store in the auxiliary | ||
106 | * search tree is the highest bit that differs between n and p. | ||
107 | * | ||
108 | * Note that this could be bit 0 - we might sometimes need all 80 bits to do the | ||
109 | * comparison. But we'd really like our nodes in the auxiliary search tree to be | ||
110 | * of fixed size. | ||
111 | * | ||
112 | * The solution is to make them fixed size, and when we're constructing a node | ||
113 | * check if p and n differed in the bits we needed them to. If they don't we | ||
114 | * flag that node, and when doing lookups we fallback to comparing against the | ||
115 | * real key. As long as this doesn't happen too often (and it seems to reliably | ||
116 | * happen a bit less than 1% of the time), we win - even on failures, that key | ||
117 | * is then more likely to be in cache than if we were doing binary searches all | ||
118 | * the way, since we're touching so much less memory. | ||
119 | * | ||
120 | * The keys in the auxiliary search tree are stored in (software) floating | ||
121 | * point, with an exponent and a mantissa. The exponent needs to be big enough | ||
122 | * to address all the bits in the original key, but the number of bits in the | ||
123 | * mantissa is somewhat arbitrary; more bits just gets us fewer failures. | ||
124 | * | ||
125 | * We need 7 bits for the exponent and 3 bits for the key's offset (since keys | ||
126 | * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. | ||
127 | * We need one node per 128 bytes in the btree node, which means the auxiliary | ||
128 | * search trees take up 3% as much memory as the btree itself. | ||
129 | * | ||
130 | * Constructing these auxiliary search trees is moderately expensive, and we | ||
131 | * don't want to be constantly rebuilding the search tree for the last set | ||
132 | * whenever we insert another key into it. For the unwritten set, we use a much | ||
133 | * simpler lookup table - it's just a flat array, so index i in the lookup table | ||
134 | * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing | ||
135 | * within each byte range works the same as with the auxiliary search trees. | ||
136 | * | ||
137 | * These are much easier to keep up to date when we insert a key - we do it | ||
138 | * somewhat lazily; when we shift a key up we usually just increment the pointer | ||
139 | * to it, only when it would overflow do we go to the trouble of finding the | ||
140 | * first key in that range of bytes again. | ||
141 | */ | ||
142 | |||
143 | /* Btree key comparison/iteration */ | ||
144 | |||
/*
 * Merging iterator over the (up to MAX_BSETS) sorted sets of one btree
 * node; yields keys from all sets in overall sorted order.
 */
struct btree_iter {
	/* capacity of data[] and number of entries currently in use */
	size_t size, used;
	/* one [k, end) range of keys per bset still being iterated */
	struct btree_iter_set {
		struct bkey *k, *end;
	} data[MAX_BSETS];
};
151 | |||
/*
 * Per-bset lookup table: a binary search tree in an array for written
 * (immutable) sets, or a flat table for the last (unwritten) set — see
 * the file comment above.
 */
struct bset_tree {
	/*
	 * We construct a binary tree in an array as if the array
	 * started at 1, so that things line up on the same cachelines
	 * better: see comments in bset.c at cacheline_to_bkey() for
	 * details
	 */

	/* size of the binary tree and prev array */
	unsigned size;

	/* function of size - precalculated for to_inorder() */
	unsigned extra;

	/* copy of the last key in the set */
	struct bkey end;
	/* auxiliary search tree: one bkey_float per BSET_CACHELINE bytes */
	struct bkey_float *tree;

	/*
	 * The nodes in the bset tree point to specific keys - this
	 * array holds the sizes of the previous key.
	 *
	 * Conceptually it's a member of struct bkey_float, but we want
	 * to keep bkey_float to 4 bytes and prev isn't used in the fast
	 * path.
	 */
	uint8_t *prev;

	/* The actual btree node, with pointers to each sorted set */
	struct bset *data;
};
183 | |||
/*
 * Total order on bkeys: compare by inode first, then by offset within
 * the inode. Returns <0, 0 or >0 in the usual strcmp() sense.
 */
static __always_inline int64_t bkey_cmp(const struct bkey *l,
					const struct bkey *r)
{
	if (unlikely(KEY_INODE(l) != KEY_INODE(r)))
		return (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r);

	return (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
}
191 | |||
/*
 * Size of @k in u64s: 2 words of header + key, one per pointer, plus
 * an optional checksum word. KEY_CSUM values > 1 are unsupported.
 */
static inline size_t bkey_u64s(const struct bkey *k)
{
	BUG_ON(KEY_CSUM(k) > 1);
	return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0);
}
197 | |||
/* Size of @k in bytes. */
static inline size_t bkey_bytes(const struct bkey *k)
{
	return bkey_u64s(k) * sizeof(uint64_t);
}
202 | |||
/* Copy the whole variable-length key, pointers included, into @dest. */
static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
{
	memcpy(dest, src, bkey_bytes(src));
}
207 | |||
/*
 * Copy only the search key (inode + offset) from @src into @dest,
 * leaving @dest's size, pointers and flags untouched. A NULL @src is
 * treated as the zero key.
 */
static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
{
	if (!src)
		src = &KEY(0, 0, 0);

	SET_KEY_INODE(dest, KEY_INODE(src));
	SET_KEY_OFFSET(dest, KEY_OFFSET(src));
}
216 | |||
/*
 * Advance past @k to the next key in a contiguously packed bset
 * (keys are variable length, so this is pointer + size-in-u64s).
 */
static inline struct bkey *bkey_next(const struct bkey *k)
{
	return (struct bkey *) ((uint64_t *) (void *) k + bkey_u64s(k));
}
222 | |||
223 | /* Keylists */ | ||
224 | |||
/*
 * Growable, contiguously packed list of bkeys. Starts out backed by
 * the inline buffer d[]; bch_keylist_realloc() moves it to a heap
 * allocation when it outgrows that.
 */
struct keylist {
	/* one past the last key; where the next key is copied to */
	struct bkey *top;
	union {
		uint64_t *list;
		struct bkey *bottom;
	};

	/* Enough room for btree_split's keys without realloc */
#define KEYLIST_INLINE 16
	uint64_t d[KEYLIST_INLINE];
};
236 | |||
237 | static inline void bch_keylist_init(struct keylist *l) | ||
238 | { | ||
239 | l->top = (void *) (l->list = l->d); | ||
240 | } | ||
241 | |||
/* Commit the key just written at l->top by advancing top past it. */
static inline void bch_keylist_push(struct keylist *l)
{
	l->top = bkey_next(l->top);
}
246 | |||
/* Append a copy of @k to the list; caller must ensure there is room. */
static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
{
	bkey_copy(l->top, k);
	bch_keylist_push(l);
}
252 | |||
253 | static inline bool bch_keylist_empty(struct keylist *l) | ||
254 | { | ||
255 | return l->top == (void *) l->list; | ||
256 | } | ||
257 | |||
/* Free the key buffer only if it was reallocated off the inline d[]. */
static inline void bch_keylist_free(struct keylist *l)
{
	if (l->list != l->d)
		kfree(l->list);
}
263 | |||
264 | void bch_keylist_copy(struct keylist *, struct keylist *); | ||
265 | struct bkey *bch_keylist_pop(struct keylist *); | ||
266 | int bch_keylist_realloc(struct keylist *, int, struct cache_set *); | ||
267 | |||
268 | void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, | ||
269 | unsigned); | ||
270 | bool __bch_cut_front(const struct bkey *, struct bkey *); | ||
271 | bool __bch_cut_back(const struct bkey *, struct bkey *); | ||
272 | |||
/*
 * Trim the front of extent @k so it starts at @where; @where must not
 * lie past the end of @k. Returns __bch_cut_front()'s result
 * (presumably whether anything was trimmed — confirm in bset.c).
 */
static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
{
	BUG_ON(bkey_cmp(where, k) > 0);
	return __bch_cut_front(where, k);
}
278 | |||
/*
 * Trim the back of extent @k so it ends at @where; @where must not lie
 * before the start of @k. Returns __bch_cut_back()'s result
 * (presumably whether anything was trimmed — confirm in bset.c).
 */
static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
{
	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
	return __bch_cut_back(where, k);
}
284 | |||
285 | const char *bch_ptr_status(struct cache_set *, const struct bkey *); | ||
286 | bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); | ||
287 | bool bch_ptr_bad(struct btree *, const struct bkey *); | ||
288 | |||
/*
 * How far generation @a is ahead of generation @b, in modulo-256
 * arithmetic; returns 0 when @a is not actually ahead of @b (the
 * wrapped difference exceeds 128).
 */
static inline uint8_t gen_after(uint8_t a, uint8_t b)
{
	uint8_t diff = a - b;

	if (diff > 128U)
		return 0;
	return diff;
}
294 | |||
/*
 * Nonzero if pointer @i of @k is stale: the bucket it points into has
 * a newer generation than the one recorded in the pointer.
 */
static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
				unsigned i)
{
	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
}
300 | |||
/* True if pointer @i of @k references a cache device present in @c. */
static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
				 unsigned i)
{
	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
}
306 | |||
307 | |||
308 | typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); | ||
309 | |||
310 | struct bkey *bch_next_recurse_key(struct btree *, struct bkey *); | ||
311 | struct bkey *bch_btree_iter_next(struct btree_iter *); | ||
312 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *, | ||
313 | struct btree *, ptr_filter_fn); | ||
314 | |||
315 | void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); | ||
316 | struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, | ||
317 | struct bkey *, struct bset_tree *); | ||
318 | |||
319 | /* 32 bits total: */ | ||
320 | #define BKEY_MID_BITS 3 | ||
321 | #define BKEY_EXPONENT_BITS 7 | ||
322 | #define BKEY_MANTISSA_BITS 22 | ||
323 | #define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) | ||
324 | |||
325 | struct bkey_float { | ||
326 | unsigned exponent:BKEY_EXPONENT_BITS; | ||
327 | unsigned m:BKEY_MID_BITS; | ||
328 | unsigned mantissa:BKEY_MANTISSA_BITS; | ||
329 | } __packed; | ||
330 | |||
331 | /* | ||
332 | * BSET_CACHELINE was originally intended to match the hardware cacheline size - | ||
333 | * it used to be 64, but I realized the lookup code would touch slightly less | ||
334 | * memory if it was 128. | ||
335 | * | ||
336 | * It defines the number of bytes (in struct bset) per struct bkey_float in | ||
337 | * the auxiliary search tree - when we're done searching the bset_float tree we | ||
338 | * have this many bytes left that we do a linear search over. | ||
339 | * | ||
340 | * Since (after level 5) every level of the bset_tree is on a new cacheline, | ||
341 | * we're touching one fewer cacheline in the bset tree in exchange for one more | ||
342 | * cacheline in the linear search - but the linear search might stop before it | ||
343 | * gets to the second cacheline. | ||
344 | */ | ||
345 | |||
346 | #define BSET_CACHELINE 128 | ||
347 | #define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE) | ||
348 | |||
349 | #define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) | ||
350 | #define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) | ||
351 | |||
352 | void bch_bset_init_next(struct btree *); | ||
353 | |||
354 | void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); | ||
355 | void bch_bset_fix_lookup_table(struct btree *, struct bkey *); | ||
356 | |||
357 | struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, | ||
358 | const struct bkey *); | ||
359 | |||
360 | static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, | ||
361 | const struct bkey *search) | ||
362 | { | ||
363 | return search ? __bch_bset_search(b, t, search) : t->data->start; | ||
364 | } | ||
365 | |||
366 | bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); | ||
367 | void bch_btree_sort_lazy(struct btree *); | ||
368 | void bch_btree_sort_into(struct btree *, struct btree *); | ||
369 | void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); | ||
370 | void bch_btree_sort_partial(struct btree *, unsigned); | ||
371 | |||
/* Resort the whole node: a full sort is a partial sort from set 0. */
static inline void bch_btree_sort(struct btree *b)
{
	bch_btree_sort_partial(b, 0);
}
376 | |||
377 | int bch_bset_print_stats(struct cache_set *, char *); | ||
378 | |||
379 | #endif | ||
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c new file mode 100644 index 000000000000..7a5658f04e62 --- /dev/null +++ b/drivers/md/bcache/btree.c | |||
@@ -0,0 +1,2503 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> | ||
3 | * | ||
4 | * Uses a block device as cache for other block devices; optimized for SSDs. | ||
5 | * All allocation is done in buckets, which should match the erase block size | ||
6 | * of the device. | ||
7 | * | ||
8 | * Buckets containing cached data are kept on a heap sorted by priority; | ||
9 | * bucket priority is increased on cache hit, and periodically all the buckets | ||
10 | * on the heap have their priority scaled down. This currently is just used as | ||
11 | * an LRU but in the future should allow for more intelligent heuristics. | ||
12 | * | ||
13 | * Buckets have an 8 bit counter; freeing is accomplished by incrementing the | ||
14 | * counter. Garbage collection is used to remove stale pointers. | ||
15 | * | ||
16 | * Indexing is done via a btree; nodes are not necessarily fully sorted, rather | ||
17 | * as keys are inserted we only sort the pages that have not yet been written. | ||
18 | * When garbage collection is run, we resort the entire node. | ||
19 | * | ||
20 | * All configuration is done via sysfs; see Documentation/bcache.txt. | ||
21 | */ | ||
22 | |||
23 | #include "bcache.h" | ||
24 | #include "btree.h" | ||
25 | #include "debug.h" | ||
26 | #include "request.h" | ||
27 | |||
28 | #include <linux/slab.h> | ||
29 | #include <linux/bitops.h> | ||
30 | #include <linux/hash.h> | ||
31 | #include <linux/prefetch.h> | ||
32 | #include <linux/random.h> | ||
33 | #include <linux/rcupdate.h> | ||
34 | #include <trace/events/bcache.h> | ||
35 | |||
36 | /* | ||
37 | * Todo: | ||
38 | * register_bcache: Return errors out to userspace correctly | ||
39 | * | ||
40 | * Writeback: don't undirty key until after a cache flush | ||
41 | * | ||
42 | * Create an iterator for key pointers | ||
43 | * | ||
44 | * On btree write error, mark bucket such that it won't be freed from the cache | ||
45 | * | ||
46 | * Journalling: | ||
47 | * Check for bad keys in replay | ||
48 | * Propagate barriers | ||
49 | * Refcount journal entries in journal_replay | ||
50 | * | ||
51 | * Garbage collection: | ||
52 | * Finish incremental gc | ||
53 | * Gc should free old UUIDs, data for invalid UUIDs | ||
54 | * | ||
55 | * Provide a way to list backing device UUIDs we have data cached for, and | ||
56 | * probably how long it's been since we've seen them, and a way to invalidate | ||
57 | * dirty data for devices that will never be attached again | ||
58 | * | ||
59 | * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so | ||
60 | * that based on that and how much dirty data we have we can keep writeback | ||
61 | * from being starved | ||
62 | * | ||
63 | * Add a tracepoint or somesuch to watch for writeback starvation | ||
64 | * | ||
65 | * When btree depth > 1 and splitting an interior node, we have to make sure | ||
66 | * alloc_bucket() cannot fail. This should be true but is not completely | ||
67 | * obvious. | ||
68 | * | ||
69 | * Make sure all allocations get charged to the root cgroup | ||
70 | * | ||
71 | * Plugging? | ||
72 | * | ||
73 | * If data write is less than hard sector size of ssd, round up offset in open | ||
74 | * bucket to the next whole sector | ||
75 | * | ||
76 | * Also lookup by cgroup in get_open_bucket() | ||
77 | * | ||
78 | * Superblock needs to be fleshed out for multiple cache devices | ||
79 | * | ||
80 | * Add a sysfs tunable for the number of writeback IOs in flight | ||
81 | * | ||
82 | * Add a sysfs tunable for the number of open data buckets | ||
83 | * | ||
84 | * IO tracking: Can we track when one process is doing io on behalf of another? | ||
85 | * IO tracking: Don't use just an average, weigh more recent stuff higher | ||
86 | * | ||
87 | * Test module load/unload | ||
88 | */ | ||
89 | |||
/* Printable names for btree_op::type, indexed by the type value. */
static const char * const op_types[] = {
	"insert", "replace"
};

/* Map a btree operation to its printable name (for debug output). */
static const char *op_type(struct btree_op *op)
{
	return op_types[op->type];
}
98 | |||
99 | #define MAX_NEED_GC 64 | ||
100 | #define MAX_SAVE_PRIO 72 | ||
101 | |||
102 | #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) | ||
103 | |||
104 | #define PTR_HASH(c, k) \ | ||
105 | (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) | ||
106 | |||
107 | struct workqueue_struct *bch_gc_wq; | ||
108 | static struct workqueue_struct *btree_io_wq; | ||
109 | |||
/*
 * Initialize a btree_op that lives on the caller's stack: zeroed,
 * with a stack closure, no btree level locked yet (-1), and an empty
 * keylist.
 */
void bch_btree_op_init_stack(struct btree_op *op)
{
	memset(op, 0, sizeof(struct btree_op));
	closure_init_stack(&op->cl);
	op->lock = -1;
	bch_keylist_init(&op->keys);
}
117 | |||
118 | /* Btree key manipulation */ | ||
119 | |||
/*
 * Drop the bucket references held by @k. In interior nodes (level > 0)
 * a key with a zero offset holds no reference, so it is skipped;
 * leaf keys (level == 0) are always put.
 */
static void bkey_put(struct cache_set *c, struct bkey *k, int level)
{
	if (!level || KEY_OFFSET(k))
		__bkey_put(c, k);
}
125 | |||
126 | /* Btree IO */ | ||
127 | |||
/*
 * Checksum a bset, seeded with the node's first pointer so identical
 * data at a different location checksums differently. The first 8
 * bytes of the bset (the csum field itself) are excluded.
 */
static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
	void *data = (void *) i + 8, *end = end(i);	/* skip i->csum */

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}
136 | |||
/*
 * bio completion for btree node reads and writes: record any error on
 * the node, account the IO error against the cache device, and drop
 * the reference on the node's io closure.
 */
static void btree_bio_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct btree *b = container_of(cl, struct btree, io.cl);

	if (error)
		set_btree_node_io_error(b);

	bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
				 ? "writing btree" : "reading btree");
	closure_put(cl);
}
149 | |||
/*
 * Allocate b->bio for a btree node IO and point its completion at the
 * node's io closure. The node must not already have a bio in flight.
 */
static void btree_bio_init(struct btree *b)
{
	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io = btree_bio_endio;
	b->bio->bi_private = &b->io.cl;
}
158 | |||
/*
 * Completion for a btree node read: validate every bset that came off
 * disk, merge-sort them all into sets[0], and mark the node readable.
 * Any validation failure flags the node with an IO error and shuts the
 * cache set down via bch_cache_set_error().
 */
void bch_btree_read_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct bset *i = b->sets[0].data;
	struct btree_iter *iter = b->c->fill_iter;
	const char *err = "bad btree header";
	BUG_ON(b->nsets || b->written);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;

	/* fill_iter is shared across the cache set, hence the lock */
	mutex_lock(&b->c->fill_lock);
	iter->used = 0;

	if (btree_node_io_error(b) ||
	    !i->seq)
		goto err;

	/* Valid on-disk bsets all carry the first bset's seq */
	for (;
	     b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, b->c) > btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(b->c))
			goto err;

		/* csum algorithm changed with BCACHE_BSET_VERSION */
		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		err = "empty set";
		if (i != b->sets[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, end(i));

		b->written += set_blocks(i, b->c);
	}

	/* No block past the last valid bset may carry this node's seq */
	err = "corrupted btree";
	for (i = write_block(b);
	     index(i, b) < btree_blocks(b);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->sets[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(b, iter);

	i = b->sets[0].data;
	err = "short btree key";
	if (b->sets[0].size &&
	    bkey_cmp(&b->key, &b->sets[0].end) < 0)
		goto err;

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
out:

	mutex_unlock(&b->c->fill_lock);

	spin_lock(&b->c->btree_read_time_lock);
	bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
	spin_unlock(&b->c->btree_read_time_lock);

	smp_wmb(); /* read_done is our write lock */
	set_btree_node_read_done(b);

	closure_return(cl);
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    index(i, b), i->keys);
	goto out;
}
249 | |||
/*
 * Submit the read for a freshly allocated btree node; completion runs
 * bch_btree_read_done() to validate and sort the contents.
 */
void bch_btree_read(struct btree *b)
{
	BUG_ON(b->nsets || b->written);

	/* A new node can't have IO in flight; trylock must succeed */
	if (!closure_trylock(&b->io.cl, &b->c->cl))
		BUG();

	b->io_start_time = local_clock();

	btree_bio_init(b);
	b->bio->bi_rw = REQ_META|READ_SYNC;
	b->bio->bi_size = KEY_SIZE(&b->key) << 9;	/* sectors to bytes */

	bch_bio_map(b->bio, b->sets[0].data);

	pr_debug("%s", pbtree(b));
	trace_bcache_btree_read(b->bio);
	bch_submit_bbio(b->bio, b->c, &b->key, 0);

	continue_at(&b->io.cl, bch_btree_read_done, system_wq);
}
271 | |||
/*
 * Finish the bookkeeping for a completed btree node write: release any
 * blocked priority writes, drop the journal pin, and wake whoever was
 * waiting on this write (w->owner), then reset @w for reuse.
 */
static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up(&b->c->alloc_wait);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	if (w->owner)
		closure_put(w->owner);

	w->prio_blocked = 0;
	w->journal = NULL;
	w->owner = NULL;
}
290 | |||
/*
 * Common tail of a btree node write: free the bio, complete the
 * write's bookkeeping, and reschedule the delayed writeout if the node
 * became dirty again in the meantime.
 */
static void __btree_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
		queue_delayed_work(btree_io_wq, &b->work,
				   msecs_to_jiffies(30000));

	closure_return(cl);
}
306 | |||
/*
 * Completion for writes that bounced the bset into freshly allocated
 * pages (see do_btree_write()): free those pages, then run the common
 * completion path.
 */
static void btree_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct bio_vec *bv;
	int n;

	__bio_for_each_segment(bv, b->bio, n, 0)
		__free_page(bv->bv_page);

	__btree_write_done(cl);
}
318 | |||
/*
 * Issue the IO for the node's current (last) bset. The write key is a
 * copy of the node key with its offset advanced to point at this bset
 * within the node.
 *
 * Preferably the bset is copied into newly allocated pages so the
 * write can complete asynchronously; if that allocation fails, the
 * in-memory bset is written in place and waited on synchronously.
 */
static void do_btree_write(struct btree *b)
{
	struct closure *cl = &b->io.cl;
	struct bset *i = b->sets[b->nsets].data;
	BKEY_PADDED(key) k;

	i->version = BCACHE_BSET_VERSION;
	i->csum = btree_csum_set(b, i);

	btree_bio_init(b);
	b->bio->bi_rw = REQ_META|WRITE_SYNC;
	b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
	bch_bio_map(b->bio, i);

	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));

	if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
		int j;
		struct bio_vec *bv;
		/* copy page-aligned chunks starting at i's page */
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

		bio_for_each_segment(bv, b->bio, j)
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		trace_bcache_btree_write(b->bio);
		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		continue_at(cl, btree_write_done, NULL);
	} else {
		/* Page allocation failed: write the bset in place */
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		trace_bcache_btree_write(b->bio);
		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
		__btree_write_done(cl);
	}
}
360 | |||
/*
 * Write out the current bset, account the sectors written, and start a
 * new bset if the node still has room. Clears the dirty bit and flips
 * which btree_write slot is current before the IO is issued.
 */
static void __btree_write(struct btree *b)
{
	struct bset *i = b->sets[b->nsets].data;

	BUG_ON(current->bio_list);

	closure_lock(&b->io, &b->c->cl);
	cancel_delayed_work(&b->work);

	clear_bit(BTREE_NODE_dirty, &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

	bch_check_key_order(b, i);
	BUG_ON(b->written && !i->keys);

	do_btree_write(b);

	pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);

	b->written += set_blocks(i, b->c);
	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

	bch_btree_sort_lazy(b);

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
}
389 | |||
/* Delayed-work entry point: flush the node if it is still dirty. */
static void btree_write_work(struct work_struct *w)
{
	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

	down_write(&b->lock);

	if (btree_node_dirty(b))
		__btree_write(b);
	up_write(&b->lock);
}
400 | |||
/*
 * Mark the node dirty and schedule its current bset for writeout;
 * write it immediately when @now is set, the node is an interior node,
 * or the bset has grown close to a page. If @op carries a journal
 * entry, this write takes a reference on it (a "journal pin").
 */
void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
{
	struct bset *i = b->sets[b->nsets].data;
	struct btree_write *w = btree_current_write(b);

	BUG_ON(b->written &&
	       (b->written >= btree_blocks(b) ||
		i->seq != b->sets[0].data->seq ||
		!i->keys));

	if (!btree_node_dirty(b)) {
		set_btree_node_dirty(b);
		queue_delayed_work(btree_io_wq, &b->work,
				   msecs_to_jiffies(30000));
	}

	w->prio_blocked += b->prio_blocked;
	b->prio_blocked = 0;

	if (op && op->journal && !b->level) {
		/* keep only the newer journal pin */
		if (w->journal &&
		    journal_pin_cmp(b->c, w, op)) {
			atomic_dec_bug(w->journal);
			w->journal = NULL;
		}

		if (!w->journal) {
			w->journal = op->journal;
			atomic_inc(w->journal);
		}
	}

	/* can't issue IO while a bio is being built up on this task */
	if (current->bio_list)
		return;

	/* Force write if set is too big */
	if (now ||
	    b->level ||
	    set_bytes(i) > PAGE_SIZE - 48) {
		if (op && now) {
			/* Must wait on multiple writes */
			BUG_ON(w->owner);
			w->owner = &op->cl;
			closure_get(&op->cl);
		}

		__btree_write(b);
	}
	BUG_ON(!b->written);
}
451 | |||
452 | /* | ||
453 | * Btree in memory cache - allocation/freeing | ||
454 | * mca -> memory cache | ||
455 | */ | ||
456 | |||
457 | static void mca_reinit(struct btree *b) | ||
458 | { | ||
459 | unsigned i; | ||
460 | |||
461 | b->flags = 0; | ||
462 | b->written = 0; | ||
463 | b->nsets = 0; | ||
464 | |||
465 | for (i = 0; i < MAX_BSETS; i++) | ||
466 | b->sets[i].size = 0; | ||
467 | /* | ||
468 | * Second loop starts at 1 because b->sets[0]->data is the memory we | ||
469 | * allocated | ||
470 | */ | ||
471 | for (i = 1; i < MAX_BSETS; i++) | ||
472 | b->sets[i].data = NULL; | ||
473 | } | ||
474 | |||
/*
 * Number of cached btree nodes held in reserve (never reclaimed by the
 * shrinker): enough for a traversal from the root plus slack.
 */
#define mca_reserve(c)	(((c->root && c->root->level)		\
			  ? c->root->level : 1) * 8 + 16)
/* Nodes the shrinker may reclaim without dipping into the reserve */
#define mca_can_free(c)						\
	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
479 | |||
/*
 * Free the buffers backing a btree node's first bset and move the node to
 * the freed list. The kfree-vs-free_pages choice must mirror how
 * mca_data_alloc() obtained each buffer.
 */
static void mca_data_free(struct btree *b)
{
	struct bset_tree *t = b->sets;
	BUG_ON(!closure_is_unlocked(&b->io.cl));

	if (bset_prev_bytes(b) < PAGE_SIZE)
		kfree(t->prev);
	else
		free_pages((unsigned long) t->prev,
			   get_order(bset_prev_bytes(b)));

	if (bset_tree_bytes(b) < PAGE_SIZE)
		kfree(t->tree);
	else
		free_pages((unsigned long) t->tree,
			   get_order(bset_tree_bytes(b)));

	free_pages((unsigned long) t->data, b->page_order);

	t->prev = NULL;
	t->tree = NULL;
	t->data = NULL;
	list_move(&b->list, &b->c->btree_cache_freed);
	b->c->bucket_cache_used--;
}
505 | |||
/*
 * Detach a (clean) node from the on-disk bucket it was caching; the struct
 * and its buffers stay allocated on the freeable list for reuse.
 */
static void mca_bucket_free(struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	/* Zero the pointer so mca_find() can no longer match this node */
	b->key.ptr[0] = 0;
	hlist_del_init_rcu(&b->hash);
	list_move(&b->list, &b->c->btree_cache_freeable);
}
514 | |||
515 | static unsigned btree_order(struct bkey *k) | ||
516 | { | ||
517 | return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); | ||
518 | } | ||
519 | |||
/*
 * Allocate the buffers for a btree node's first bset: the data pages plus
 * the auxiliary search tree and prev arrays. On any failure everything is
 * released again via mca_data_free() (which also moves the node to the
 * freed list); success is indicated by t->data being non-NULL.
 */
static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
	struct bset_tree *t = b->sets;
	BUG_ON(t->data);

	/* Node must be big enough for both the cache set and this key */
	b->page_order = max_t(unsigned, 
			      ilog2(b->c->btree_pages),
			      btree_order(k));

	t->data = (void *) __get_free_pages(gfp, b->page_order);
	if (!t->data)
		goto err;

	/* Small allocations via kmalloc, large via the page allocator --
	 * mca_data_free() makes the same size-based choice when freeing */
	t->tree = bset_tree_bytes(b) < PAGE_SIZE
		? kmalloc(bset_tree_bytes(b), gfp)
		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
	if (!t->tree)
		goto err;

	t->prev = bset_prev_bytes(b) < PAGE_SIZE
		? kmalloc(bset_prev_bytes(b), gfp)
		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
	if (!t->prev)
		goto err;

	list_move(&b->list, &b->c->btree_cache);
	b->c->bucket_cache_used++;
	return;
err:
	mca_data_free(b);
}
551 | |||
/*
 * Allocate and initialize a new in-memory btree node, including its data
 * buffers. Returns NULL if the struct itself couldn't be allocated; if only
 * the data allocation failed, a struct with NULL sets[0].data is returned
 * (callers check for that).
 */
static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	init_rwsem(&b->lock);
	/* Per-level lock ordering is enforced manually, not by lockdep */
	lockdep_set_novalidate_class(&b->lock);
	INIT_LIST_HEAD(&b->list);
	INIT_DELAYED_WORK(&b->work, btree_write_work);
	b->c = c;
	closure_init_unlocked(&b->io);

	mca_data_alloc(b, k, gfp);
	return b;
}
569 | |||
/*
 * Try to take a btree node for reuse. Returns 0 with the node write-locked
 * on success; -ENOMEM if the node is busy or too small (< @min_order);
 * -EAGAIN if it's still dirty or has IO in flight. If @cl is given, a flush
 * is started and @cl waits for in-flight IO so a later retry can succeed.
 */
static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
{
	lockdep_assert_held(&b->c->bucket_lock);

	if (!down_write_trylock(&b->lock))
		return -ENOMEM;

	if (b->page_order < min_order) {
		rw_unlock(true, b);
		return -ENOMEM;
	}

	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);

	if (cl && btree_node_dirty(b))
		bch_btree_write(b, true, NULL);

	if (cl)
		closure_wait_event_async(&b->io.wait, cl,
			 atomic_read(&b->io.cl.remaining) == -1);

	/* Still dirty, IO in flight, or delayed write pending: not reapable */
	if (btree_node_dirty(b) ||
	    !closure_is_unlocked(&b->io.cl) ||
	    work_pending(&b->work.work)) {
		rw_unlock(true, b);
		return -EAGAIN;
	}

	return 0;
}
600 | |||
/*
 * Memory shrinker callback for the btree node cache (old shrinker API:
 * one callback both reports the count and does the scanning).
 *
 * Frees node buffers from the freeable list first, then reclaims clean,
 * unaccessed nodes from the main cache. Returns the (approximate) number
 * of reclaimable pages remaining, or -1 if the lock couldn't be taken.
 */
static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
	struct btree *b, *t;
	unsigned long i, nr = sc->nr_to_scan;

	if (c->shrinker_disabled)
		return 0;

	/* Someone is cannibalizing nodes; don't fight over them */
	if (c->try_harder)
		return 0;

	/*
	 * If nr == 0, we're supposed to return the number of items we have
	 * cached. Not allowed to return -1.
	 */
	if (!nr)
		return mca_can_free(c) * c->btree_pages;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_WAIT)
		mutex_lock(&c->bucket_lock);
	else if (!mutex_trylock(&c->bucket_lock))
		return -1;

	/* Scan count is in pages; convert to nodes and clamp to reclaimable */
	nr /= c->btree_pages;
	nr = min_t(unsigned long, nr, mca_can_free(c));

	i = 0;
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		if (!nr)
			break;

		/* Skip the first few entries so very recent frees survive */
		if (++i > 3 &&
		    !mca_reap(b, NULL, 0)) {
			mca_data_free(b);
			rw_unlock(true, b);
			--nr;
		}
	}

	/*
	 * Can happen right when we first start up, before we've read in any
	 * btree nodes
	 */
	if (list_empty(&c->btree_cache))
		goto out;

	for (i = 0; nr && i < c->bucket_cache_used; i++) {
		b = list_first_entry(&c->btree_cache, struct btree, list);
		list_rotate_left(&c->btree_cache);

		if (!b->accessed &&
		    !mca_reap(b, NULL, 0)) {
			mca_bucket_free(b);
			mca_data_free(b);
			rw_unlock(true, b);
			--nr;
		} else
			/* Give recently used nodes a second pass to survive */
			b->accessed = 0;
	}
out:
	nr = mca_can_free(c) * c->btree_pages;
	mutex_unlock(&c->bucket_lock);
	return nr;
}
667 | |||
/*
 * Tear down the btree node cache on cache set shutdown: unregister the
 * shrinker, complete any outstanding writes (without actually writing --
 * we're going away), and free every cached node.
 */
void bch_btree_cache_free(struct cache_set *c)
{
	struct btree *b;
	struct closure cl;
	closure_init_stack(&cl);

	/* Only unregister if bch_btree_cache_alloc() got far enough */
	if (c->shrink.list.next)
		unregister_shrinker(&c->shrink);

	mutex_lock(&c->bucket_lock);

#ifdef CONFIG_BCACHE_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);
#endif

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		/* Release journal pins etc. for dirty nodes before freeing */
		if (btree_node_dirty(b))
			btree_complete_write(b, btree_current_write(b));
		clear_bit(BTREE_NODE_dirty, &b->flags);

		/* mca_data_free() moves the node onto btree_cache_freed */
		mca_data_free(b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		cancel_delayed_work_sync(&b->work);
		kfree(b);
	}

	mutex_unlock(&c->bucket_lock);
}
707 | |||
/*
 * Set up the btree node cache at cache set registration: preallocate the
 * reserve of nodes and register the memory shrinker. Always returns 0;
 * allocation failures here are tolerated (see the XXX below) and will
 * surface later as -ENOMEM from mca_alloc().
 */
int bch_btree_cache_alloc(struct cache_set *c)
{
	unsigned i;

	/* XXX: doesn't check for errors */

	closure_init_unlocked(&c->gc);

	for (i = 0; i < mca_reserve(c); i++)
		mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	/* Fresh nodes aren't caching anything yet: park them as freeable */
	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHE_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	if (c->verify_data &&
	    c->verify_data->sets[0].data)
		list_del_init(&c->verify_data->list);
	else
		c->verify_data = NULL;
#endif

	c->shrink.shrink = bch_mca_shrink;
	c->shrink.seeks = 4;
	c->shrink.batch = c->btree_pages * 2;
	register_shrinker(&c->shrink);

	return 0;
}
741 | |||
742 | /* Btree in memory cache - hash table */ | ||
743 | |||
744 | static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) | ||
745 | { | ||
746 | return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; | ||
747 | } | ||
748 | |||
/*
 * Look up a btree node in the in-memory cache by its on-disk location.
 * Lockless: the hash table is traversed under RCU. Returns NULL if the
 * node isn't cached; the returned node is not locked.
 */
static struct btree *mca_find(struct cache_set *c, struct bkey *k)
{
	struct btree *b;

	rcu_read_lock();
	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
			goto out;
	b = NULL;
out:
	rcu_read_unlock();
	return b;
}
762 | |||
/*
 * Last-resort node allocation: steal an already-cached btree node, flushing
 * it first if necessary. Only one thread may do this at a time (the
 * try_harder "lock"), released later via bch_cannibalize_unlock().
 * Returns a locked node, or ERR_PTR(-ENOMEM/-EAGAIN).
 */
static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
				     int level, struct closure *cl)
{
	int ret = -ENOMEM;
	struct btree *i;

	if (!cl)
		return ERR_PTR(-ENOMEM);

	/*
	 * Trying to free up some memory - i.e. reuse some btree nodes - may
	 * require initiating IO to flush the dirty part of the node. If we're
	 * running under generic_make_request(), that IO will never finish and
	 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
	 * punt to workqueue and retry.
	 */
	if (current->bio_list)
		return ERR_PTR(-EAGAIN);

	/* Another closure already holds the cannibalize lock: wait for it */
	if (c->try_harder && c->try_harder != cl) {
		closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
		return ERR_PTR(-EAGAIN);
	}

	/* XXX: tracepoint */
	c->try_harder = cl;
	c->try_harder_start = local_clock();
retry:
	/* Reverse order: prefer the least recently used nodes */
	list_for_each_entry_reverse(i, &c->btree_cache, list) {
		int r = mca_reap(i, cl, btree_order(k));
		if (!r)
			return i;
		if (r != -ENOMEM)
			ret = r;
	}

	/* Blocking closures may simply wait for flushes and try again */
	if (ret == -EAGAIN &&
	    closure_blocking(cl)) {
		mutex_unlock(&c->bucket_lock);
		closure_sync(cl);
		mutex_lock(&c->bucket_lock);
		goto retry;
	}

	return ERR_PTR(ret);
}
809 | |||
810 | /* | ||
811 | * We can only have one thread cannibalizing other cached btree nodes at a time, | ||
812 | * or we'll deadlock. We use an open coded mutex to ensure that, which a | ||
813 | * cannibalize_bucket() will take. This means every time we unlock the root of | ||
814 | * the btree, we need to release this lock if we have it held. | ||
815 | */ | ||
816 | void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) | ||
817 | { | ||
818 | if (c->try_harder == cl) { | ||
819 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); | ||
820 | c->try_harder = NULL; | ||
821 | __closure_wake_up(&c->try_wait); | ||
822 | } | ||
823 | } | ||
824 | |||
/*
 * Allocate an in-memory node for the on-disk node at @k, trying in order:
 * the freeable list (structs with buffers), the freed list (structs without
 * buffers), a fresh allocation, and finally cannibalizing another cached
 * node. Returns the node write-locked and hashed; NULL if @k is already
 * cached; or an ERR_PTR. Caller holds c->bucket_lock.
 */
static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
			       int level, struct closure *cl)
{
	struct btree *b;

	lockdep_assert_held(&c->bucket_lock);

	if (mca_find(c, k))
		return NULL;

	/* btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
		if (!mca_reap(b, NULL, btree_order(k)))
			goto out;

	/* We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
		if (!mca_reap(b, NULL, 0)) {
			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
			if (!b->sets[0].data)
				goto err;
			else
				goto out;
		}

	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	/* A brand new node can't be contended */
	BUG_ON(!down_write_trylock(&b->lock));
	if (!b->sets->data)
		goto err;
out:
	BUG_ON(!closure_is_unlocked(&b->io.cl));

	bkey_copy(&b->key, k);
	list_move(&b->list, &c->btree_cache);
	hlist_del_init_rcu(&b->hash);
	hlist_add_head_rcu(&b->hash, mca_hash(c, k));

	/* Lock subclass tracks btree level for correct nesting order */
	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
	b->level	= level;

	mca_reinit(b);

	return b;
err:
	if (b)
		rw_unlock(true, b);

	b = mca_cannibalize(c, k, level, cl);
	if (!IS_ERR(b))
		goto out;

	return b;
}
885 | |||
886 | /** | ||
887 | * bch_btree_node_get - find a btree node in the cache and lock it, reading it | ||
888 | * in from disk if necessary. | ||
889 | * | ||
890 | * If IO is necessary, it uses the closure embedded in struct btree_op to wait; | ||
891 | * if that closure is in non blocking mode, will return -EAGAIN. | ||
892 | * | ||
893 | * The btree node will have either a read or a write lock held, depending on | ||
894 | * level and op->lock. | ||
895 | */ | ||
896 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, | ||
897 | int level, struct btree_op *op) | ||
898 | { | ||
899 | int i = 0; | ||
900 | bool write = level <= op->lock; | ||
901 | struct btree *b; | ||
902 | |||
903 | BUG_ON(level < 0); | ||
904 | retry: | ||
905 | b = mca_find(c, k); | ||
906 | |||
907 | if (!b) { | ||
908 | mutex_lock(&c->bucket_lock); | ||
909 | b = mca_alloc(c, k, level, &op->cl); | ||
910 | mutex_unlock(&c->bucket_lock); | ||
911 | |||
912 | if (!b) | ||
913 | goto retry; | ||
914 | if (IS_ERR(b)) | ||
915 | return b; | ||
916 | |||
917 | bch_btree_read(b); | ||
918 | |||
919 | if (!write) | ||
920 | downgrade_write(&b->lock); | ||
921 | } else { | ||
922 | rw_lock(write, b, level); | ||
923 | if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { | ||
924 | rw_unlock(write, b); | ||
925 | goto retry; | ||
926 | } | ||
927 | BUG_ON(b->level != level); | ||
928 | } | ||
929 | |||
930 | b->accessed = 1; | ||
931 | |||
932 | for (; i <= b->nsets && b->sets[i].size; i++) { | ||
933 | prefetch(b->sets[i].tree); | ||
934 | prefetch(b->sets[i].data); | ||
935 | } | ||
936 | |||
937 | for (; i <= b->nsets; i++) | ||
938 | prefetch(b->sets[i].data); | ||
939 | |||
940 | if (!closure_wait_event(&b->io.wait, &op->cl, | ||
941 | btree_node_read_done(b))) { | ||
942 | rw_unlock(write, b); | ||
943 | b = ERR_PTR(-EAGAIN); | ||
944 | } else if (btree_node_io_error(b)) { | ||
945 | rw_unlock(write, b); | ||
946 | b = ERR_PTR(-EIO); | ||
947 | } else | ||
948 | BUG_ON(!b->written); | ||
949 | |||
950 | return b; | ||
951 | } | ||
952 | |||
953 | static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | ||
954 | { | ||
955 | struct btree *b; | ||
956 | |||
957 | mutex_lock(&c->bucket_lock); | ||
958 | b = mca_alloc(c, k, level, NULL); | ||
959 | mutex_unlock(&c->bucket_lock); | ||
960 | |||
961 | if (!IS_ERR_OR_NULL(b)) { | ||
962 | bch_btree_read(b); | ||
963 | rw_unlock(true, b); | ||
964 | } | ||
965 | } | ||
966 | |||
967 | /* Btree alloc */ | ||
968 | |||
/*
 * Free a btree node: complete any pending write, release prio blockers,
 * bump the bucket generations so stale pointers to it are detected, and
 * return both the on-disk bucket and the in-memory node for reuse.
 */
static void btree_node_free(struct btree *b, struct btree_op *op)
{
	unsigned i;

	/*
	 * The BUG_ON() in btree_node_get() implies that we must have a write
	 * lock on parent to free or even invalidate a node
	 */
	BUG_ON(op->lock <= b->level);
	BUG_ON(b == b->c->root);
	pr_debug("bucket %s", pbtree(b));

	if (btree_node_dirty(b))
		btree_complete_write(b, btree_current_write(b));
	clear_bit(BTREE_NODE_dirty, &b->flags);

	/* Wake allocators that were waiting on our prio-write blockers */
	if (b->prio_blocked &&
	    !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
		wake_up(&b->c->alloc_wait);

	b->prio_blocked = 0;

	cancel_delayed_work(&b->work);

	mutex_lock(&b->c->bucket_lock);

	for (i = 0; i < KEY_PTRS(&b->key); i++) {
		BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));

		/* Invalidate any remaining pointers to this bucket */
		bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
			    PTR_BUCKET(b->c, &b->key, i));
	}

	bch_bucket_free(b->c, &b->key);
	mca_bucket_free(b);
	mutex_unlock(&b->c->bucket_lock);
}
1006 | |||
/*
 * Allocate a brand new btree node: a metadata bucket on disk plus an
 * in-memory node for it. Returns the node write-locked and marked
 * read-done, or ERR_PTR(-EAGAIN) if the bucket allocation would block.
 */
struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
				   struct closure *cl)
{
	BKEY_PADDED(key) k;
	struct btree *b = ERR_PTR(-EAGAIN);

	mutex_lock(&c->bucket_lock);
retry:
	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
		goto err;

	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);

	b = mca_alloc(c, &k.key, level, cl);
	if (IS_ERR(b))
		goto err_free;

	/* NULL means the freshly allocated bucket was somehow cached */
	if (!b) {
		cache_bug(c,
			"Tried to allocate bucket that was in btree cache");
		__bkey_put(c, &k.key);
		goto retry;
	}

	/* No on-disk contents to read: the node starts out empty */
	set_btree_node_read_done(b);
	b->accessed = 1;
	bch_bset_init_next(b);

	mutex_unlock(&c->bucket_lock);
	return b;
err_free:
	bch_bucket_free(c, &k.key);
	__bkey_put(c, &k.key);
err:
	mutex_unlock(&c->bucket_lock);
	return b;
}
1044 | |||
1045 | static struct btree *btree_node_alloc_replacement(struct btree *b, | ||
1046 | struct closure *cl) | ||
1047 | { | ||
1048 | struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); | ||
1049 | if (!IS_ERR_OR_NULL(n)) | ||
1050 | bch_btree_sort_into(b, n); | ||
1051 | |||
1052 | return n; | ||
1053 | } | ||
1054 | |||
1055 | /* Garbage collection */ | ||
1056 | |||
/*
 * GC mark pass for a single key: record the key's generation on each bucket
 * it points into and update the bucket's GC mark/sector counts. Returns the
 * largest staleness seen across the key's pointers (0 if none stale).
 */
uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
{
	uint8_t stale = 0;
	unsigned i;
	struct bucket *g;

	/*
	 * ptr_invalid() can't return true for the keys that mark btree nodes as
	 * freed, but since ptr_bad() returns true we'll never actually use them
	 * for anything and thus we don't want to mark their pointers here
	 */
	if (!bkey_cmp(k, &ZERO_KEY))
		return stale;

	for (i = 0; i < KEY_PTRS(k); i++) {
		if (!ptr_available(c, k, i))
			continue;

		g = PTR_BUCKET(c, k, i);

		/* Track the oldest referenced gen so the bucket isn't reused */
		if (gen_after(g->gc_gen, PTR_GEN(k, i)))
			g->gc_gen = PTR_GEN(k, i);

		if (ptr_stale(c, k, i)) {
			stale = max(stale, ptr_stale(c, k, i));
			continue;
		}

		/* Buckets must not mix metadata and data pointers */
		cache_bug_on(GC_MARK(g) &&
			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
			     c, "inconsistent ptrs: mark = %llu, level = %i",
			     GC_MARK(g), level);

		if (level)
			SET_GC_MARK(g, GC_MARK_METADATA);
		else if (KEY_DIRTY(k))
			SET_GC_MARK(g, GC_MARK_DIRTY);

		/* guard against overflow */
		SET_GC_SECTORS_USED(g, min_t(unsigned,
					     GC_SECTORS_USED(g) + KEY_SIZE(k),
					     (1 << 14) - 1));

		BUG_ON(!GC_SECTORS_USED(g));
	}

	return stale;
}

/* Convenience wrapper taking the cache set and level from the node */
#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
1107 | |||
/*
 * GC mark pass over all keys in one btree node. Accumulates the good keys'
 * size into *keys and the GC statistics into @gc; returns the worst
 * staleness found (used by callers to decide whether to rewrite the node).
 */
static int btree_gc_mark_node(struct btree *b, unsigned *keys,
			      struct gc_stat *gc)
{
	uint8_t stale = 0;
	unsigned last_dev = -1;
	struct bcache_device *d = NULL;
	struct bkey *k;
	struct btree_iter iter;
	struct bset_tree *t;

	gc->nodes++;

	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
		/* Cache the device lookup across runs of same-inode keys */
		if (last_dev != KEY_INODE(k)) {
			last_dev = KEY_INODE(k);

			d = KEY_INODE(k) < b->c->nr_uuids
				? b->c->devices[last_dev]
				: NULL;
		}

		stale = max(stale, btree_mark_key(b, k));

		/* Bad (e.g. stale) keys still got marked, but don't count */
		if (bch_ptr_bad(b, k))
			continue;

		*keys += bkey_u64s(k);

		gc->key_bytes += bkey_u64s(k);
		gc->nkeys++;

		gc->data += KEY_SIZE(k);
		if (KEY_DIRTY(k)) {
			gc->dirty += KEY_SIZE(k);
			if (d)
				d->sectors_dirty_gc += KEY_SIZE(k);
		}
	}

	/* Sanity: no written bset may end past the node's own key */
	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
		btree_bug_on(t->size &&
			     bset_written(b, t) &&
			     bkey_cmp(&b->key, &t->end) < 0,
			     b, "found short btree key in gc");

	return stale;
}
1155 | |||
/*
 * Replace btree node @b with a freshly allocated copy during GC, freeing
 * the old node. @k (the pointer to @b in its parent) is updated to point at
 * the replacement. Returns the new node, or @b unchanged if allocation
 * failed.
 */
static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
				    struct btree_op *op)
{
	/*
	 * We block priorities from being written for the duration of garbage
	 * collection, so we can't sleep in btree_alloc() ->
	 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
	 * our closure.
	 */
	struct btree *n = btree_node_alloc_replacement(b, NULL);

	if (!IS_ERR_OR_NULL(n)) {
		/* After the swap, b is the new node and n the old one */
		swap(b, n);

		memcpy(k->ptr, b->key.ptr,
		       sizeof(uint64_t) * KEY_PTRS(&b->key));

		/* Hold back prio writes until the new node is on disk */
		__bkey_put(b->c, &b->key);
		atomic_inc(&b->c->prio_blocked);
		b->prio_blocked++;

		btree_node_free(n, op);
		up_write(&n->lock);
	}

	return b;
}
1183 | |||
1184 | /* | ||
1185 | * Leaving this at 2 until we've got incremental garbage collection done; it | ||
1186 | * could be higher (and has been tested with 4) except that garbage collection | ||
1187 | * could take much longer, adversely affecting latency. | ||
1188 | */ | ||
1189 | #define GC_MERGE_NODES 2U | ||
1190 | |||
1191 | struct gc_merge_info { | ||
1192 | struct btree *b; | ||
1193 | struct bkey *k; | ||
1194 | unsigned keys; | ||
1195 | }; | ||
1196 | |||
/*
 * Try to merge the nodes in the GC sliding window @r into fewer nodes,
 * shifting keys towards higher-indexed (earlier) entries and freeing
 * r[0] when everything fits. Bails out silently whenever the combined
 * keys wouldn't fit or a node couldn't be reallocated.
 */
static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
			      struct gc_stat *gc, struct gc_merge_info *r)
{
	unsigned nodes = 0, keys = 0, blocks;
	int i;

	while (nodes < GC_MERGE_NODES && r[nodes].b)
		keys += r[nodes++].keys;

	/* Only coalesce if the result is comfortably under-full */
	blocks = btree_default_blocks(b->c) * 2 / 3;

	if (nodes < 2 ||
	    __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
		return;

	/* Every node in the window must be a fresh (unwritten) copy */
	for (i = nodes - 1; i >= 0; --i) {
		if (r[i].b->written)
			r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);

		if (r[i].b->written)
			return;
	}

	for (i = nodes - 1; i > 0; --i) {
		struct bset *n1 = r[i].b->sets->data;
		struct bset *n2 = r[i - 1].b->sets->data;
		struct bkey *k, *last = NULL;

		keys = 0;

		if (i == 1) {
			/*
			 * Last node we're not getting rid of - we're getting
			 * rid of the node at r[0]. Have to try and fit all of
			 * the remaining keys into this node; we can't ensure
			 * they will always fit due to rounding and variable
			 * length keys (shouldn't be possible in practice,
			 * though)
			 */
			if (__set_blocks(n1, n1->keys + r->keys,
					 b->c) > btree_blocks(r[i].b))
				return;

			keys = n2->keys;
			last = &r->b->key;
		} else
			/* Take as many leading keys from n2 as fit in n1 */
			for (k = n2->start;
			     k < end(n2);
			     k = bkey_next(k)) {
				if (__set_blocks(n1, n1->keys + keys +
						 bkey_u64s(k), b->c) > blocks)
					break;

				last = k;
				keys += bkey_u64s(k);
			}

		BUG_ON(__set_blocks(n1, n1->keys + keys,
				    b->c) > btree_blocks(r[i].b));

		/* The receiving node's key must cover the moved keys */
		if (last) {
			bkey_copy_key(&r[i].b->key, last);
			bkey_copy_key(r[i].k, last);
		}

		/* Append the moved keys to n1, then close the gap in n2 */
		memcpy(end(n1),
		       n2->start,
		       (void *) node(n2, keys) - (void *) n2->start);

		n1->keys += keys;

		memmove(n2->start,
			node(n2, keys),
			(void *) end(n2) - (void *) node(n2, keys));

		n2->keys -= keys;

		r[i].keys	= n1->keys;
		r[i - 1].keys	= n2->keys;
	}

	/* r[0] has been drained into the rest of the window: free it */
	btree_node_free(r->b, op);
	up_write(&r->b->lock);

	pr_debug("coalesced %u nodes", nodes);

	gc->nodes--;
	nodes--;

	memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
	memset(&r[nodes], 0, sizeof(struct gc_merge_info));
}
1289 | |||
/*
 * Depth-first GC over the subtree rooted at @b: mark each child, optionally
 * rewrite stale/internal nodes, recurse, and coalesce siblings through a
 * GC_MERGE_NODES-wide sliding window. Children falling out of the window
 * are flushed and unlocked via the nested write() helper (GCC extension).
 */
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
			    struct closure *writes, struct gc_stat *gc)
{
	/* Flush a finished child and release its lock */
	void write(struct btree *r)
	{
		if (!r->written)
			bch_btree_write(r, true, op);
		else if (btree_node_dirty(r)) {
			/* Hand the wait off to @writes so GC can continue */
			BUG_ON(btree_current_write(r)->owner);
			btree_current_write(r)->owner = writes;
			closure_get(writes);

			bch_btree_write(r, true, NULL);
		}

		up_write(&r->lock);
	}

	int ret = 0, stale;
	unsigned i;
	struct gc_merge_info r[GC_MERGE_NODES];

	memset(r, 0, sizeof(r));

	/* gc_done records progress so an aborted GC can resume */
	while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
		r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op);

		if (IS_ERR(r->b)) {
			ret = PTR_ERR(r->b);
			break;
		}

		r->keys	= 0;
		stale = btree_gc_mark_node(r->b, &r->keys, gc);

		/* Rewrite internal or very stale nodes to a fresh bucket */
		if (!b->written &&
		    (r->b->level || stale > 10 ||
		     b->c->gc_always_rewrite))
			r->b = btree_gc_alloc(r->b, r->k, op);

		if (r->b->level)
			ret = btree_gc_recurse(r->b, op, writes, gc);

		if (ret) {
			write(r->b);
			break;
		}

		bkey_copy_key(&b->c->gc_done, r->k);

		if (!b->written)
			btree_gc_coalesce(b, op, gc, r);

		/* Slide the window: retire the oldest entry */
		if (r[GC_MERGE_NODES - 1].b)
			write(r[GC_MERGE_NODES - 1].b);

		memmove(&r[1], &r[0],
			sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));

		/* When we've got incremental GC working, we'll want to do
		 * if (should_resched())
		 *	return -EAGAIN;
		 */
		cond_resched();
#if 0
		if (need_resched()) {
			ret = -EAGAIN;
			break;
		}
#endif
	}

	for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
		write(r[i].b);

	/* Might have freed some children, must remove their keys */
	if (!b->written)
		bch_btree_sort(b);

	return ret;
}
1371 | |||
/*
 * GC entry point for the root node: mark it, replace it if it's internal or
 * stale, recurse into its children, and install the replacement (if any) as
 * the new root once its write completes.
 */
static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
			     struct closure *writes, struct gc_stat *gc)
{
	struct btree *n = NULL;
	unsigned keys = 0;
	int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);

	if (b->level || stale > 10)
		n = btree_node_alloc_replacement(b, NULL);

	/* Work on the replacement from here on; n becomes the old root */
	if (!IS_ERR_OR_NULL(n))
		swap(b, n);

	if (b->level)
		ret = btree_gc_recurse(b, op, writes, gc);

	if (!b->written || btree_node_dirty(b)) {
		atomic_inc(&b->c->prio_blocked);
		b->prio_blocked++;
		bch_btree_write(b, true, n ? op : NULL);
	}

	if (!IS_ERR_OR_NULL(n)) {
		/* New root must be on disk before pointing the sb at it */
		closure_sync(&op->cl);
		bch_btree_set_root(b);
		btree_node_free(n, op);
		rw_unlock(true, b);
	}

	return ret;
}
1403 | |||
/*
 * Begin a GC pass: invalidate the previous pass's marks, reset every
 * bucket's gc_gen, and zero the per-device dirty-sector counters that the
 * mark pass will recompute. No-op if the previous pass never finished
 * (gc_mark_valid already cleared).
 */
static void btree_gc_start(struct cache_set *c)
{
	struct cache *ca;
	struct bucket *b;
	struct bcache_device **d;
	unsigned i;

	if (!c->gc_mark_valid)
		return;

	mutex_lock(&c->bucket_lock);

	c->gc_mark_valid = 0;
	c->gc_done = ZERO_KEY;

	for_each_cache(ca, c, i)
		for_each_bucket(b, ca) {
			b->gc_gen = b->gen;
			/* Pinned buckets are in use and must not be marked */
			if (!atomic_read(&b->pin))
				SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
		}

	for (d = c->devices;
	     d < c->devices + c->nr_uuids;
	     d++)
		if (*d)
			(*d)->sectors_dirty_gc = 0;

	mutex_unlock(&c->bucket_lock);
}
1434 | |||
/*
 * Finish a GC pass: mark the metadata buckets the tree walk couldn't see
 * (root, uuids, journal, prios), commit the recomputed generations and
 * dirty counts, and return how many buckets are reclaimable.
 */
size_t bch_btree_gc_finish(struct cache_set *c)
{
	size_t available = 0;
	struct bucket *b;
	struct cache *ca;
	struct bcache_device **d;
	unsigned i;

	mutex_lock(&c->bucket_lock);

	set_gc_sectors(c);
	c->gc_mark_valid = 1;
	c->need_gc	= 0;

	if (c->root)
		for (i = 0; i < KEY_PTRS(&c->root->key); i++)
			SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
				    GC_MARK_METADATA);

	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
			    GC_MARK_METADATA);

	for_each_cache(ca, c, i) {
		uint64_t *i;

		ca->invalidate_needs_gc = 0;

		/* Journal buckets */
		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		/* Bucket priority buckets (both copies) */
		for (i = ca->prio_buckets;
		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for_each_bucket(b, ca) {
			b->last_gc	= b->gc_gen;
			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));

			if (!atomic_read(&b->pin) &&
			    GC_MARK(b) == GC_MARK_RECLAIMABLE) {
				available++;
				if (!GC_SECTORS_USED(b))
					bch_bucket_add_unused(ca, b);
			}
		}
	}

	/* Resync each device's running dirty count with the GC recount */
	for (d = c->devices;
	     d < c->devices + c->nr_uuids;
	     d++)
		if (*d) {
			unsigned long last =
				atomic_long_read(&((*d)->sectors_dirty));
			long difference = (*d)->sectors_dirty_gc - last;

			pr_debug("sectors dirty off by %li", difference);

			(*d)->sectors_dirty_last += difference;

			atomic_long_set(&((*d)->sectors_dirty),
					(*d)->sectors_dirty_gc);
		}

	mutex_unlock(&c->bucket_lock);
	return available;
}
1502 | |||
/*
 * Closure entry point for a full garbage collection pass: marks the
 * btree from the root down, then fixes up bucket marks and publishes
 * GC statistics. On success it chains into bch_moving_gc(); on failure
 * it requeues itself to retry the whole pass.
 */
static void bch_btree_gc(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
	int ret;
	unsigned long available;
	struct gc_stat stats;
	struct closure writes;
	struct btree_op op;

	uint64_t start_time = local_clock();
	trace_bcache_gc_start(c->sb.set_uuid);
	blktrace_msg_all(c, "Starting gc");

	memset(&stats, 0, sizeof(struct gc_stat));
	closure_init_stack(&writes);
	bch_btree_op_init_stack(&op);
	/* NOTE(review): SHRT_MAX appears to mean "write lock at any depth" —
	 * op->lock is compared against node levels elsewhere; confirm. */
	op.lock = SHRT_MAX;

	btree_gc_start(c);

	ret = btree_root(gc_root, c, &op, &writes, &stats);
	closure_sync(&op.cl);
	closure_sync(&writes);

	if (ret) {
		blktrace_msg_all(c, "Stopped gc");
		pr_warn("gc failed!");

		/* Requeue ourselves to retry from the top */
		continue_at(cl, bch_btree_gc, bch_gc_wq);
	}

	/* Possibly wait for new UUIDs or whatever to hit disk */
	bch_journal_meta(c, &op.cl);
	closure_sync(&op.cl);

	available = bch_btree_gc_finish(c);

	bch_time_stats_update(&c->btree_gc_time, start_time);

	stats.key_bytes *= sizeof(uint64_t);
	stats.dirty	<<= 9;	/* sectors -> bytes */
	stats.data	<<= 9;
	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
	blktrace_msg_all(c, "Finished gc");

	trace_bcache_gc_end(c->sb.set_uuid);
	/* Freed-up buckets may unblock allocation */
	wake_up(&c->alloc_wait);

	continue_at(cl, bch_moving_gc, bch_gc_wq);
}
1554 | |||
/*
 * Kick off a GC pass on bch_gc_wq; the trylock makes this a no-op if
 * the gc closure is already running.
 */
void bch_queue_gc(struct cache_set *c)
{
	closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
}
1559 | |||
1560 | /* Initial partial gc */ | ||
1561 | |||
/*
 * Walk one btree node during the initial partial GC done at cache
 * registration: fix up bucket generations and priorities for every
 * pointer, mark the keys, and recurse into children (prefetching the
 * next child while the current one is processed).
 *
 * @seen: per-cache-device bitmaps of bucket numbers already visited.
 */
static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
				   unsigned long **seen)
{
	int ret;
	unsigned i;
	struct bkey *k;
	struct bucket *g;
	struct btree_iter iter;

	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
		for (i = 0; i < KEY_PTRS(k); i++) {
			if (!ptr_available(b->c, k, i))
				continue;

			g = PTR_BUCKET(b->c, k, i);

			/*
			 * First time we see this bucket, or the pointer is
			 * current: adopt the key's generation, and give
			 * btree buckets btree priority (demoting buckets
			 * that used to hold btree nodes but no longer do).
			 */
			if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
						seen[PTR_DEV(k, i)]) ||
			    !ptr_stale(b->c, k, i)) {
				g->gen = PTR_GEN(k, i);

				if (b->level)
					g->prio = BTREE_PRIO;
				else if (g->prio == BTREE_PRIO)
					g->prio = INITIAL_PRIO;
			}
		}

		btree_mark_key(b, k);
	}

	if (b->level) {
		k = bch_next_recurse_key(b, &ZERO_KEY);

		while (k) {
			struct bkey *p = bch_next_recurse_key(b, k);
			/* Read ahead the next child node */
			if (p)
				btree_node_prefetch(b->c, p, b->level - 1);

			ret = btree(check_recurse, k, b, op, seen);
			if (ret)
				return ret;

			k = p;
		}
	}

	return 0;
}
1611 | |||
1612 | int bch_btree_check(struct cache_set *c, struct btree_op *op) | ||
1613 | { | ||
1614 | int ret = -ENOMEM; | ||
1615 | unsigned i; | ||
1616 | unsigned long *seen[MAX_CACHES_PER_SET]; | ||
1617 | |||
1618 | memset(seen, 0, sizeof(seen)); | ||
1619 | |||
1620 | for (i = 0; c->cache[i]; i++) { | ||
1621 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); | ||
1622 | seen[i] = kmalloc(n, GFP_KERNEL); | ||
1623 | if (!seen[i]) | ||
1624 | goto err; | ||
1625 | |||
1626 | /* Disables the seen array until prio_read() uses it too */ | ||
1627 | memset(seen[i], 0xFF, n); | ||
1628 | } | ||
1629 | |||
1630 | ret = btree_root(check_recurse, c, op, seen); | ||
1631 | err: | ||
1632 | for (i = 0; i < MAX_CACHES_PER_SET; i++) | ||
1633 | kfree(seen[i]); | ||
1634 | return ret; | ||
1635 | } | ||
1636 | |||
1637 | /* Btree insertion */ | ||
1638 | |||
/*
 * Insert @insert into the node's currently-open bset immediately before
 * @where, shifting the tail of the set up to make room and repairing the
 * auxiliary lookup table afterwards.
 */
static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
{
	struct bset *i = b->sets[b->nsets].data;

	/* Open a gap of bkey_u64s(insert) words at @where */
	memmove((uint64_t *) where + bkey_u64s(insert),
		where,
		(void *) end(i) - (void *) where);

	i->keys += bkey_u64s(insert);
	bkey_copy(where, insert);
	bch_bset_fix_lookup_table(b, where);
}
1651 | |||
/*
 * Resolve overlaps between @insert and the existing extents in @b,
 * trimming or splitting old keys so @insert can go in cleanly. For
 * BTREE_REPLACE operations this also verifies that the data being
 * overwritten matches op->replace, trimming @insert on a partial match.
 *
 * Returns true if the insert should be aborted (replace check failed
 * with nothing found), false if insertion should proceed.
 */
static bool fix_overlapping_extents(struct btree *b,
				    struct bkey *insert,
				    struct btree_iter *iter,
				    struct btree_op *op)
{
	/* Drop @sectors from the dirty count of the device @k lives on */
	void subtract_dirty(struct bkey *k, int sectors)
	{
		struct bcache_device *d = b->c->devices[KEY_INODE(k)];

		if (KEY_DIRTY(k) && d)
			atomic_long_sub(sectors, &d->sectors_dirty);
	}

	unsigned old_size, sectors_found = 0;

	while (1) {
		struct bkey *k = bch_btree_iter_next(iter);
		/* Stop once existing keys start entirely after @insert */
		if (!k ||
		    bkey_cmp(&START_KEY(k), insert) >= 0)
			break;

		/* Skip keys that end at or before @insert's start */
		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
			continue;

		old_size = KEY_SIZE(k);

		/*
		 * We might overlap with 0 size extents; we can't skip these
		 * because if they're in the set we're inserting to we have to
		 * adjust them so they don't overlap with the key we're
		 * inserting. But we don't want to check them for BTREE_REPLACE
		 * operations.
		 */

		if (op->type == BTREE_REPLACE &&
		    KEY_SIZE(k)) {
			/*
			 * k might have been split since we inserted/found the
			 * key we're replacing
			 */
			unsigned i;
			uint64_t offset = KEY_START(k) -
				KEY_START(&op->replace);

			/* But it must be a subset of the replace key */
			if (KEY_START(k) < KEY_START(&op->replace) ||
			    KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
				goto check_failed;

			/* We didn't find a key that we were supposed to */
			if (KEY_START(k) > KEY_START(insert) + sectors_found)
				goto check_failed;

			if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
				goto check_failed;

			/* skip past gen */
			offset <<= 8;

			BUG_ON(!KEY_PTRS(&op->replace));

			/* Every pointer must match, adjusted for the split */
			for (i = 0; i < KEY_PTRS(&op->replace); i++)
				if (k->ptr[i] != op->replace.ptr[i] + offset)
					goto check_failed;

			sectors_found = KEY_OFFSET(k) - KEY_START(insert);
		}

		if (bkey_cmp(insert, k) < 0 &&
		    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
			/*
			 * We overlapped in the middle of an existing key: that
			 * means we have to split the old key. But we have to do
			 * slightly different things depending on whether the
			 * old key has been written out yet.
			 */

			struct bkey *top;

			subtract_dirty(k, KEY_SIZE(insert));

			if (bkey_written(b, k)) {
				/*
				 * We insert a new key to cover the top of the
				 * old key, and the old key is modified in place
				 * to represent the bottom split.
				 *
				 * It's completely arbitrary whether the new key
				 * is the top or the bottom, but it has to match
				 * up with what btree_sort_fixup() does - it
				 * doesn't check for this kind of overlap, it
				 * depends on us inserting a new key for the top
				 * here.
				 */
				top = bch_bset_search(b, &b->sets[b->nsets],
						      insert);
				shift_keys(b, top, k);
			} else {
				/* Unwritten key: duplicate it in place */
				BKEY_PADDED(key) temp;
				bkey_copy(&temp.key, k);
				shift_keys(b, k, &temp.key);
				top = bkey_next(k);
			}

			/* Trim the two halves around @insert */
			bch_cut_front(insert, top);
			bch_cut_back(&START_KEY(insert), k);
			bch_bset_fix_invalidated_key(b, k);
			return false;
		}

		if (bkey_cmp(insert, k) < 0) {
			/* @k sticks out past the end of @insert: trim front */
			bch_cut_front(insert, k);
		} else {
			if (bkey_written(b, k) &&
			    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
				/*
				 * Completely overwrote, so we don't have to
				 * invalidate the binary search tree
				 */
				bch_cut_front(k, k);
			} else {
				__bch_cut_back(&START_KEY(insert), k);
				bch_bset_fix_invalidated_key(b, k);
			}
		}

		subtract_dirty(k, old_size - KEY_SIZE(k));
	}

check_failed:
	if (op->type == BTREE_REPLACE) {
		if (!sectors_found) {
			/* Nothing matched op->replace: abort the insert */
			op->insert_collision = true;
			return true;
		} else if (sectors_found < KEY_SIZE(insert)) {
			/* Partial match: shrink @insert to the matched part */
			SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
				       (KEY_SIZE(insert) - sectors_found));
			SET_KEY_SIZE(insert, sectors_found);
		}
	}

	return false;
}
1795 | |||
/*
 * Insert one key into node @b's open bset, resolving extent overlaps
 * and attempting front/back merges with neighbouring keys for leaf
 * (extent) nodes.
 *
 * Returns true if the key was inserted/merged, false if a BTREE_REPLACE
 * check failed in fix_overlapping_extents().
 */
static bool btree_insert_key(struct btree *b, struct btree_op *op,
			     struct bkey *k)
{
	struct bset *i = b->sets[b->nsets].data;
	struct bkey *m, *prev;
	const char *status = "insert";

	BUG_ON(bkey_cmp(k, &b->key) > 0);
	BUG_ON(b->level && !KEY_PTRS(k));
	BUG_ON(!b->level && !KEY_OFFSET(k));

	if (!b->level) {
		struct btree_iter iter;
		struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);

		/*
		 * bset_search() returns the first key that is strictly greater
		 * than the search key - but for back merging, we want to find
		 * the first key that is greater than or equal to KEY_START(k) -
		 * unless KEY_START(k) is 0.
		 */
		if (KEY_OFFSET(&search))
			SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);

		prev = NULL;
		m = bch_btree_iter_init(b, &iter, &search);

		if (fix_overlapping_extents(b, k, &iter, op))
			return false;

		/* Advance m to the insert position; prev trails one behind */
		while (m != end(i) &&
		       bkey_cmp(k, &START_KEY(m)) > 0)
			prev = m, m = bkey_next(m);

		if (key_merging_disabled(b->c))
			goto insert;

		/* prev is in the tree, if we merge we're done */
		status = "back merging";
		if (prev &&
		    bch_bkey_try_merge(b, prev, k))
			goto merged;

		/* Zero-size key with matching ptr count: overwrite in place */
		status = "overwrote front";
		if (m != end(i) &&
		    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
			goto copy;

		status = "front merge";
		if (m != end(i) &&
		    bch_bkey_try_merge(b, k, m))
			goto copy;
	} else
		/* Interior nodes: no merging, just find the position */
		m = bch_bset_search(b, &b->sets[b->nsets], k);

insert:	shift_keys(b, m, k);
copy:	bkey_copy(m, k);
merged:
	bch_check_keys(b, "%s for %s at %s: %s", status,
		       op_type(op), pbtree(b), pkey(k));
	bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status,
				op_type(op), pbtree(b), pkey(k));

	if (b->level && !KEY_OFFSET(k))
		b->prio_blocked++;

	pr_debug("%s for %s at %s: %s", status,
		 op_type(op), pbtree(b), pkey(k));

	return true;
}
1867 | |||
1868 | bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) | ||
1869 | { | ||
1870 | bool ret = false; | ||
1871 | struct bkey *k; | ||
1872 | unsigned oldsize = bch_count_data(b); | ||
1873 | |||
1874 | while ((k = bch_keylist_pop(&op->keys))) { | ||
1875 | bkey_put(b->c, k, b->level); | ||
1876 | ret |= btree_insert_key(b, op, k); | ||
1877 | } | ||
1878 | |||
1879 | BUG_ON(bch_count_data(b) < oldsize); | ||
1880 | return ret; | ||
1881 | } | ||
1882 | |||
/*
 * Insert a "check key" (a fake key with a PTR_CHECK_DEV pointer and a
 * random payload) covering @bio's range, used to detect races. The node
 * comes in read-locked; we upgrade to a write lock and verify (via the
 * saved key pointer and sequence number) that the node wasn't replaced
 * or rewritten while unlocked.
 *
 * Returns true if the check key was inserted. The node is left
 * read-locked either way (downgrade_write on exit).
 */
bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
				struct bio *bio)
{
	bool ret = false;
	uint64_t btree_ptr = b->key.ptr[0];
	unsigned long seq = b->seq;
	BKEY_PADDED(k) tmp;

	/* Upgrade read lock -> write lock (can't upgrade atomically) */
	rw_unlock(false, b);
	rw_lock(true, b, b->level);

	/*
	 * Bail if the node changed identity while unlocked, was written
	 * more than once (seq advanced past +1), or now needs splitting.
	 */
	if (b->key.ptr[0] != btree_ptr ||
	    b->seq != seq + 1 ||
	    should_split(b))
		goto out;

	op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));

	SET_KEY_PTRS(&op->replace, 1);
	get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));

	SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);

	bkey_copy(&tmp.k, &op->replace);

	BUG_ON(op->type != BTREE_INSERT);
	BUG_ON(!btree_insert_key(b, op, &tmp.k));
	bch_btree_write(b, false, NULL);
	ret = true;
out:
	downgrade_write(&b->lock);
	return ret;
}
1916 | |||
/*
 * Split (or just rewrite) node @b while inserting op->keys. Allocates a
 * replacement n1, optionally a sibling n2 (if the result would be too
 * full) and a new root n3 (if @b was the root). Keys pointing at the new
 * node(s) are queued on op->keys for insertion into the parent, except
 * in the root case where the root pointer is updated directly.
 *
 * Returns 0 on success, -EAGAIN if an allocation must wait, -ENOMEM
 * otherwise.
 */
static int btree_split(struct btree *b, struct btree_op *op)
{
	bool split, root = b == b->c->root;
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	uint64_t start_time = local_clock();

	if (b->level)
		set_closure_blocking(&op->cl);

	n1 = btree_node_alloc_replacement(b, &op->cl);
	if (IS_ERR(n1))
		goto err;

	/* Split if the merged node would be more than ~80% full */
	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;

	pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
		 pbtree(b), n1->sets[0].data->keys);

	if (split) {
		unsigned keys = 0;

		n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
		if (IS_ERR(n2))
			goto err_free1;

		/* Splitting the root needs a new root one level up */
		if (root) {
			n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
			if (IS_ERR(n3))
				goto err_free2;
		}

		bch_btree_insert_keys(n1, op);

		/* Has to be a linear search because we don't have an auxiliary
		 * search tree yet
		 */

		/* Walk ~60% of the keys into n1; the rest go to n2 */
		while (keys < (n1->sets[0].data->keys * 3) / 5)
			keys += bkey_u64s(node(n1->sets[0].data, keys));

		bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
		keys += bkey_u64s(node(n1->sets[0].data, keys));

		n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
		n1->sets[0].data->keys = keys;

		memcpy(n2->sets[0].data->start,
		       end(n1->sets[0].data),
		       n2->sets[0].data->keys * sizeof(uint64_t));

		/* n2 inherits b's (upper) boundary key */
		bkey_copy_key(&n2->key, &b->key);

		bch_keylist_add(&op->keys, &n2->key);
		bch_btree_write(n2, true, op);
		rw_unlock(true, n2);
	} else
		bch_btree_insert_keys(n1, op);

	bch_keylist_add(&op->keys, &n1->key);
	bch_btree_write(n1, true, op);

	if (n3) {
		/* New root: insert the queued keys and make it live */
		bkey_copy_key(&n3->key, &MAX_KEY);
		bch_btree_insert_keys(n3, op);
		bch_btree_write(n3, true, op);

		closure_sync(&op->cl);
		bch_btree_set_root(n3);
		rw_unlock(true, n3);
	} else if (root) {
		/* Root rewritten without split: nothing to queue upward */
		op->keys.top = op->keys.bottom;
		closure_sync(&op->cl);
		bch_btree_set_root(n1);
	} else {
		unsigned i;

		/*
		 * Queue a key that invalidates b's old pointers (bumped
		 * generations) so the parent drops them.
		 */
		bkey_copy(op->keys.top, &b->key);
		bkey_copy_key(op->keys.top, &ZERO_KEY);

		for (i = 0; i < KEY_PTRS(&b->key); i++) {
			uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;

			SET_PTR_GEN(op->keys.top, i, g);
		}

		bch_keylist_push(&op->keys);
		closure_sync(&op->cl);
		atomic_inc(&b->c->prio_blocked);
	}

	rw_unlock(true, n1);
	btree_node_free(b, op);

	bch_time_stats_update(&b->c->btree_split_time, start_time);

	return 0;
err_free2:
	__bkey_put(n2->c, &n2->key);
	btree_node_free(n2, op);
	rw_unlock(true, n2);
err_free1:
	__bkey_put(n1->c, &n1->key);
	btree_node_free(n1, op);
	rw_unlock(true, n1);
err:
	if (n3 == ERR_PTR(-EAGAIN) ||
	    n2 == ERR_PTR(-EAGAIN) ||
	    n1 == ERR_PTR(-EAGAIN))
		return -EAGAIN;

	pr_warn("couldn't split");
	return -ENOMEM;
}
2030 | |||
/*
 * Recursive worker for bch_btree_insert(): descend toward the leaf that
 * op->keys belongs in, splitting the insert key across child boundaries
 * (spilling the remainder onto @stack_keys), then insert at this level,
 * splitting the node if necessary.
 *
 * Returns 0 on success, -EINTR if the traversal must restart with a
 * bigger lock, or an error from deeper levels.
 */
static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
				    struct keylist *stack_keys)
{
	if (b->level) {
		int ret;
		struct bkey *insert = op->keys.bottom;
		struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));

		if (!k) {
			btree_bug(b, "no key to recurse on at level %i/%i",
				  b->level, b->c->root->level);

			/* Drop the queued keys; nowhere to put them */
			op->keys.top = op->keys.bottom;
			return -EIO;
		}

		if (bkey_cmp(insert, k) > 0) {
			/* Insert key spans past this child's boundary */
			unsigned i;

			if (op->type == BTREE_REPLACE) {
				__bkey_put(b->c, insert);
				op->keys.top = op->keys.bottom;
				op->insert_collision = true;
				return 0;
			}

			/* Pin the buckets for the duplicated pointers */
			for (i = 0; i < KEY_PTRS(insert); i++)
				atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);

			bkey_copy(stack_keys->top, insert);

			/* Trim insert to this child; defer the remainder */
			bch_cut_back(k, insert);
			bch_cut_front(k, stack_keys->top);

			bch_keylist_push(stack_keys);
		}

		ret = btree(insert_recurse, k, b, op, stack_keys);
		if (ret)
			return ret;
	}

	if (!bch_keylist_empty(&op->keys)) {
		if (should_split(b)) {
			/*
			 * Splitting may need to update the parent; if we
			 * don't hold a big enough lock, restart with one.
			 */
			if (op->lock <= b->c->root->level) {
				BUG_ON(b->level);
				op->lock = b->c->root->level + 1;
				return -EINTR;
			}
			return btree_split(b, op);
		}

		BUG_ON(write_block(b) != b->sets[b->nsets].data);

		if (bch_btree_insert_keys(b, op))
			bch_btree_write(b, false, op);
	}

	return 0;
}
2091 | |||
/*
 * Top-level btree insert: moves op->keys onto a local stack keylist and
 * feeds them through the root one at a time, retrying on -EAGAIN and
 * dropping remaining keys' references on hard errors. Also releases the
 * journal reference held by @op, if any.
 */
int bch_btree_insert(struct btree_op *op, struct cache_set *c)
{
	int ret = 0;
	struct keylist stack_keys;

	/*
	 * Don't want to block with the btree locked unless we have to,
	 * otherwise we get deadlocks with try_harder and between split/gc
	 */
	clear_closure_blocking(&op->cl);

	BUG_ON(bch_keylist_empty(&op->keys));
	bch_keylist_copy(&stack_keys, &op->keys);
	bch_keylist_init(&op->keys);

	while (!bch_keylist_empty(&stack_keys) ||
	       !bch_keylist_empty(&op->keys)) {
		/* Refill op->keys with the next pending key */
		if (bch_keylist_empty(&op->keys)) {
			bch_keylist_add(&op->keys,
					bch_keylist_pop(&stack_keys));
			op->lock = 0;
		}

		ret = btree_root(insert_recurse, c, op, &stack_keys);

		if (ret == -EAGAIN) {
			/* Wait for whatever we blocked on, then retry */
			ret = 0;
			closure_sync(&op->cl);
		} else if (ret) {
			struct bkey *k;

			pr_err("error %i trying to insert key for %s",
			       ret, op_type(op));

			/* Drop references on everything still queued */
			while ((k = bch_keylist_pop(&stack_keys) ?:
				    bch_keylist_pop(&op->keys)))
				bkey_put(c, k, 0);
		}
	}

	bch_keylist_free(&stack_keys);

	if (op->journal)
		atomic_dec_bug(op->journal);
	op->journal = NULL;
	return ret;
}
2139 | |||
/*
 * Make @b the cache set's root node: take it off the btree cache LRU
 * list, publish it as c->root, and journal the change so it's
 * persistent.
 */
void bch_btree_set_root(struct btree *b)
{
	unsigned i;

	/* The root must already be on disk */
	BUG_ON(!b->written);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);

	/* The root is never on the LRU/freeable lists */
	mutex_lock(&b->c->bucket_lock);
	list_del_init(&b->list);
	mutex_unlock(&b->c->bucket_lock);

	b->c->root = b;
	__bkey_put(b->c, &b->key);

	bch_journal_meta(b->c, NULL);
	pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
}
2159 | |||
2160 | /* Cache lookup */ | ||
2161 | |||
/*
 * Feed the part of the search bio that lies before key @k to the
 * device's cache_miss handler, in chunks, until the bio reaches @k (or
 * the lookup is finished or the handler returns an error).
 */
static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
				     struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *bio = &s->bio.bio;
	int ret = 0;

	while (!ret &&
	       !op->lookup_done) {
		unsigned sectors = INT_MAX;

		if (KEY_INODE(k) == op->inode) {
			/* Bio has caught up with the key: miss is done */
			if (KEY_START(k) <= bio->bi_sector)
				break;

			/* Only the gap up to the key's start is a miss */
			sectors = min_t(uint64_t, sectors,
					KEY_START(k) - bio->bi_sector);
		}

		ret = s->d->cache_miss(b, s, bio, sectors);
	}

	return ret;
}
2186 | |||
2187 | /* | ||
2188 | * Read from a single key, handling the initial cache miss if the key starts in | ||
2189 | * the middle of the bio | ||
2190 | */ | ||
/*
 * Read from a single key, handling the initial cache miss if the key starts in
 * the middle of the bio
 */
static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
				    struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *bio = &s->bio.bio;
	unsigned ptr;
	struct bio *n;

	/* First service any gap before the key as a cache miss */
	int ret = submit_partial_cache_miss(b, op, k);
	if (ret || op->lookup_done)
		return ret;

	/* XXX: figure out best pointer - for multiple cache devices */
	ptr = 0;

	/* Bump the bucket's priority since it's being read */
	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;

	while (!op->lookup_done &&
	       KEY_INODE(k) == op->inode &&
	       bio->bi_sector < KEY_OFFSET(k)) {
		struct bkey *bio_key;
		/* Translate the bio's sector into the cache device */
		sector_t sector = PTR_OFFSET(k, ptr) +
			(bio->bi_sector - KEY_START(k));
		unsigned sectors = min_t(uint64_t, INT_MAX,
					 KEY_OFFSET(k) - bio->bi_sector);

		n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
		if (!n)
			return -EAGAIN;

		/* Split returned the original bio: this is the last chunk */
		if (n == bio)
			op->lookup_done = true;

		bio_key = &container_of(n, struct bbio, bio)->key;

		/*
		 * The bucket we're reading from might be reused while our bio
		 * is in flight, and we could then end up reading the wrong
		 * data.
		 *
		 * We guard against this by checking (in cache_read_endio()) if
		 * the pointer is stale again; if so, we treat it as an error
		 * and reread from the backing device (but we don't pass that
		 * error up anywhere).
		 */

		bch_bkey_copy_single_ptr(bio_key, k, ptr);
		SET_PTR_OFFSET(bio_key, 0, sector);

		n->bi_end_io	= bch_cache_read_endio;
		n->bi_private	= &s->cl;

		trace_bcache_cache_hit(n);
		__bch_submit_bbio(n, b->c);
	}

	return 0;
}
2249 | |||
/*
 * Recursive cache lookup for a read: iterate the keys covering the
 * search bio's range, recursing into children on interior nodes and
 * submitting hits/misses on leaves. Anything past the last key is a
 * miss up to this node's boundary.
 */
int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *bio = &s->bio.bio;

	int ret = 0;
	struct bkey *k;
	struct btree_iter iter;
	bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));

	pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
		 (uint64_t) bio->bi_sector);

	do {
		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
		if (!k) {
			/*
			 * b->key would be exactly what we want, except that
			 * pointers to btree nodes have nonzero size - we
			 * wouldn't go far enough
			 */

			ret = submit_partial_cache_miss(b, op,
					&KEY(KEY_INODE(&b->key),
					     KEY_OFFSET(&b->key), 0));
			break;
		}

		ret = b->level
			? btree(search_recurse, k, b, op)
			: submit_partial_cache_hit(b, op, k);
	} while (!ret &&
		 !op->lookup_done);

	return ret;
}
2286 | |||
2287 | /* Keybuf code */ | ||
2288 | |||
2289 | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) | ||
2290 | { | ||
2291 | /* Overlapping keys compare equal */ | ||
2292 | if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) | ||
2293 | return -1; | ||
2294 | if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) | ||
2295 | return 1; | ||
2296 | return 0; | ||
2297 | } | ||
2298 | |||
2299 | static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | ||
2300 | struct keybuf_key *r) | ||
2301 | { | ||
2302 | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); | ||
2303 | } | ||
2304 | |||
/*
 * Scan keys from buf->last_scanned up to @end, adding those matching
 * buf->key_predicate to the keybuf's RB-tree until the freelist runs
 * dry. Recurses through interior nodes; leaves advance last_scanned.
 */
static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
				   struct keybuf *buf, struct bkey *end)
{
	struct btree_iter iter;
	bch_btree_iter_init(b, &iter, &buf->last_scanned);

	while (!array_freelist_empty(&buf->freelist)) {
		struct bkey *k = bch_btree_iter_next_filter(&iter, b,
							    bch_ptr_bad);

		if (!b->level) {
			if (!k) {
				/* Node exhausted: resume after its boundary */
				buf->last_scanned = b->key;
				break;
			}

			buf->last_scanned = *k;
			if (bkey_cmp(&buf->last_scanned, end) >= 0)
				break;

			if (buf->key_predicate(buf, k)) {
				struct keybuf_key *w;

				pr_debug("%s", pkey(k));

				spin_lock(&buf->lock);

				w = array_alloc(&buf->freelist);

				w->private = NULL;
				bkey_copy(&w->key, k);

				/* Already present (overlap): put it back */
				if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
					array_free(&buf->freelist, w);

				spin_unlock(&buf->lock);
			}
		} else {
			if (!k)
				break;

			btree(refill_keybuf, k, b, op, buf, end);
			/*
			 * Might get an error here, but can't really do anything
			 * and it'll get logged elsewhere. Just read what we
			 * can.
			 */

			if (bkey_cmp(&buf->last_scanned, end) >= 0)
				break;

			cond_resched();
		}
	}

	return 0;
}
2362 | |||
/*
 * Refill @buf from the btree, scanning from buf->last_scanned to @end,
 * then record the range actually covered in buf->start/buf->end (used
 * by bch_keybuf_check_overlapping()).
 */
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end)
{
	struct bkey start = buf->last_scanned;
	struct btree_op op;
	bch_btree_op_init_stack(&op);

	cond_resched();

	btree_root(refill_keybuf, c, &op, buf, end);
	closure_sync(&op.cl);

	pr_debug("found %s keys from %llu:%llu to %llu:%llu",
		 RB_EMPTY_ROOT(&buf->keys) ? "no" :
		 array_freelist_empty(&buf->freelist) ? "some" : "a few",
		 KEY_INODE(&start), KEY_OFFSET(&start),
		 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));

	spin_lock(&buf->lock);

	if (!RB_EMPTY_ROOT(&buf->keys)) {
		/* Buffer covers [first key's start, last key's end] */
		struct keybuf_key *w;
		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
		buf->start	= START_KEY(&w->key);

		w = RB_LAST(&buf->keys, struct keybuf_key, node);
		buf->end	= w->key;
	} else {
		/* Empty buffer: MAX_KEY..MAX_KEY overlaps nothing */
		buf->start	= MAX_KEY;
		buf->end	= MAX_KEY;
	}

	spin_unlock(&buf->lock);
}
2397 | |||
/* Remove @w from the keybuf and recycle it; caller holds buf->lock. */
static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	rb_erase(&w->node, &buf->keys);
	array_free(&buf->freelist, w);
}
2403 | |||
/* Locked wrapper around __bch_keybuf_del(). */
void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	spin_lock(&buf->lock);
	__bch_keybuf_del(buf, w);
	spin_unlock(&buf->lock);
}
2410 | |||
/*
 * Check whether any buffered key overlaps [start, end); keys that are
 * not in use (private == NULL) are dropped from the buffer instead.
 *
 * Returns true if an in-use key overlaps the range.
 */
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{
	bool ret = false;
	struct keybuf_key *p, *w, s;
	s.key = *start;

	/* Quick reject against the range the buffer currently covers */
	if (bkey_cmp(end, &buf->start) <= 0 ||
	    bkey_cmp(start, &buf->end) >= 0)
		return false;

	spin_lock(&buf->lock);
	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
		p = w;
		w = RB_NEXT(w, node);

		if (p->private)
			ret = true;
		else
			__bch_keybuf_del(buf, p);
	}

	spin_unlock(&buf->lock);
	return ret;
}
2438 | |||
2439 | struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | ||
2440 | { | ||
2441 | struct keybuf_key *w; | ||
2442 | spin_lock(&buf->lock); | ||
2443 | |||
2444 | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | ||
2445 | |||
2446 | while (w && w->private) | ||
2447 | w = RB_NEXT(w, node); | ||
2448 | |||
2449 | if (w) | ||
2450 | w->private = ERR_PTR(-EINTR); | ||
2451 | |||
2452 | spin_unlock(&buf->lock); | ||
2453 | return w; | ||
2454 | } | ||
2455 | |||
2456 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | ||
2457 | struct keybuf *buf, | ||
2458 | struct bkey *end) | ||
2459 | { | ||
2460 | struct keybuf_key *ret; | ||
2461 | |||
2462 | while (1) { | ||
2463 | ret = bch_keybuf_next(buf); | ||
2464 | if (ret) | ||
2465 | break; | ||
2466 | |||
2467 | if (bkey_cmp(&buf->last_scanned, end) >= 0) { | ||
2468 | pr_debug("scan finished"); | ||
2469 | break; | ||
2470 | } | ||
2471 | |||
2472 | bch_refill_keybuf(c, buf, end); | ||
2473 | } | ||
2474 | |||
2475 | return ret; | ||
2476 | } | ||
2477 | |||
2478 | void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) | ||
2479 | { | ||
2480 | buf->key_predicate = fn; | ||
2481 | buf->last_scanned = MAX_KEY; | ||
2482 | buf->keys = RB_ROOT; | ||
2483 | |||
2484 | spin_lock_init(&buf->lock); | ||
2485 | array_allocator_init(&buf->freelist); | ||
2486 | } | ||
2487 | |||
/* Tear down the btree workqueues; safe if init only partially ran. */
void bch_btree_exit(void)
{
	if (btree_io_wq)
		destroy_workqueue(btree_io_wq);
	if (bch_gc_wq)
		destroy_workqueue(bch_gc_wq);
}
2495 | |||
2496 | int __init bch_btree_init(void) | ||
2497 | { | ||
2498 | if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || | ||
2499 | !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) | ||
2500 | return -ENOMEM; | ||
2501 | |||
2502 | return 0; | ||
2503 | } | ||
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h new file mode 100644 index 000000000000..af4a7092a28c --- /dev/null +++ b/drivers/md/bcache/btree.h | |||
@@ -0,0 +1,405 @@ | |||
1 | #ifndef _BCACHE_BTREE_H | ||
2 | #define _BCACHE_BTREE_H | ||
3 | |||
4 | /* | ||
5 | * THE BTREE: | ||
6 | * | ||
7 | * At a high level, bcache's btree is relatively standard b+ tree. All keys and | ||
8 | * pointers are in the leaves; interior nodes only have pointers to the child | ||
9 | * nodes. | ||
10 | * | ||
11 | * In the interior nodes, a struct bkey always points to a child btree node, and | ||
12 | * the key is the highest key in the child node - except that the highest key in | ||
13 | * an interior node is always MAX_KEY. The size field refers to the size on disk | ||
14 | * of the child node - this would allow us to have variable sized btree nodes | ||
15 | * (handy for keeping the depth of the btree 1 by expanding just the root). | ||
16 | * | ||
17 | * Btree nodes are themselves log structured, but this is hidden fairly | ||
18 | * thoroughly. Btree nodes on disk will in practice have extents that overlap | ||
19 | * (because they were written at different times), but in memory we never have | ||
20 | * overlapping extents - when we read in a btree node from disk, the first thing | ||
21 | * we do is resort all the sets of keys with a mergesort, and in the same pass | ||
22 | * we check for overlapping extents and adjust them appropriately. | ||
23 | * | ||
24 | * struct btree_op is a central interface to the btree code. It's used for | ||
25 | * specifying read vs. write locking, and the embedded closure is used for | ||
26 | * waiting on IO or reserve memory. | ||
27 | * | ||
28 | * BTREE CACHE: | ||
29 | * | ||
30 | * Btree nodes are cached in memory; traversing the btree might require reading | ||
31 | * in btree nodes which is handled mostly transparently. | ||
32 | * | ||
33 | * bch_btree_node_get() looks up a btree node in the cache and reads it in from | ||
34 | * disk if necessary. This function is almost never called directly though - the | ||
35 | * btree() macro is used to get a btree node, call some function on it, and | ||
36 | * unlock the node after the function returns. | ||
37 | * | ||
38 | * The root is special cased - it's taken out of the cache's lru (thus pinning | ||
39 | * it in memory), so we can find the root of the btree by just dereferencing a | ||
40 | * pointer instead of looking it up in the cache. This makes locking a bit | ||
41 | * tricky, since the root pointer is protected by the lock in the btree node it | ||
42 | * points to - the btree_root() macro handles this. | ||
43 | * | ||
44 | * In various places we must be able to allocate memory for multiple btree nodes | ||
45 | * in order to make forward progress. To do this we use the btree cache itself | ||
46 | * as a reserve; if __get_free_pages() fails, we'll find a node in the btree | ||
47 | * cache we can reuse. We can't allow more than one thread to be doing this at a | ||
48 | * time, so there's a lock, implemented by a pointer to the btree_op closure - | ||
49 | * this allows the btree_root() macro to implicitly release this lock. | ||
50 | * | ||
51 | * BTREE IO: | ||
52 | * | ||
53 | * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles | ||
54 | * this. | ||
55 | * | ||
 * For writing, we have two btree_write structs embedded in struct btree - one
57 | * write in flight, and one being set up, and we toggle between them. | ||
58 | * | ||
59 | * Writing is done with a single function - bch_btree_write() really serves two | ||
60 | * different purposes and should be broken up into two different functions. When | ||
61 | * passing now = false, it merely indicates that the node is now dirty - calling | ||
62 | * it ensures that the dirty keys will be written at some point in the future. | ||
63 | * | ||
64 | * When passing now = true, bch_btree_write() causes a write to happen | ||
65 | * "immediately" (if there was already a write in flight, it'll cause the write | ||
66 | * to happen as soon as the previous write completes). It returns immediately | ||
67 | * though - but it takes a refcount on the closure in struct btree_op you passed | ||
68 | * to it, so a closure_sync() later can be used to wait for the write to | ||
69 | * complete. | ||
70 | * | ||
71 | * This is handy because btree_split() and garbage collection can issue writes | ||
72 | * in parallel, reducing the amount of time they have to hold write locks. | ||
73 | * | ||
74 | * LOCKING: | ||
75 | * | ||
76 | * When traversing the btree, we may need write locks starting at some level - | ||
77 | * inserting a key into the btree will typically only require a write lock on | ||
78 | * the leaf node. | ||
79 | * | ||
80 | * This is specified with the lock field in struct btree_op; lock = 0 means we | ||
81 | * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get() | ||
82 | * checks this field and returns the node with the appropriate lock held. | ||
83 | * | ||
84 | * If, after traversing the btree, the insertion code discovers it has to split | ||
85 | * then it must restart from the root and take new locks - to do this it changes | ||
86 | * the lock field and returns -EINTR, which causes the btree_root() macro to | ||
87 | * loop. | ||
88 | * | ||
 * Handling cache misses requires a different mechanism for upgrading to a write
90 | * lock. We do cache lookups with only a read lock held, but if we get a cache | ||
91 | * miss and we wish to insert this data into the cache, we have to insert a | ||
92 | * placeholder key to detect races - otherwise, we could race with a write and | ||
93 | * overwrite the data that was just written to the cache with stale data from | ||
94 | * the backing device. | ||
95 | * | ||
96 | * For this we use a sequence number that write locks and unlocks increment - to | ||
97 | * insert the check key it unlocks the btree node and then takes a write lock, | ||
98 | * and fails if the sequence number doesn't match. | ||
99 | */ | ||
100 | |||
101 | #include "bset.h" | ||
102 | #include "debug.h" | ||
103 | |||
/*
 * State for one write of a btree node; struct btree embeds two of these
 * and toggles between them (one write in flight, one being set up).
 */
struct btree_write {
	/* NOTE(review): presumably the closure driving this write - confirm */
	struct closure *owner;
	/* Journal entry this write holds a refcount on */
	atomic_t *journal;

	/* If btree_split() frees a btree node, it writes a new pointer to that
	 * btree node indicating it was freed; it takes a refcount on
	 * c->prio_blocked because we can't write the gens until the new
	 * pointer is on disk. This allows btree_write_endio() to release the
	 * refcount that btree_split() took.
	 */
	int prio_blocked;
};
116 | |||
struct btree {
	/* Hottest entries first */
	struct hlist_node hash;

	/* Key/pointer for this btree node */
	BKEY_PADDED(key);

	/* Single bit - set when accessed, cleared by shrinker */
	unsigned long accessed;
	/* Bumped on every write lock and unlock (see rw_lock()/rw_unlock());
	 * used to detect intervening writes after dropping a lock */
	unsigned long seq;
	struct rw_semaphore lock;
	struct cache_set *c;

	/* BTREE_NODE_* bits - see enum btree_flags */
	unsigned long flags;
	uint16_t written;	/* would be nice to kill */
	uint8_t level;		/* 0 == leaf */
	uint8_t nsets;		/* highest valid index into sets[] */
	uint8_t page_order;

	/*
	 * Set of sorted keys - the real btree node - plus a binary search tree
	 *
	 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
	 * to the memory we have allocated for this btree node. Additionally,
	 * set[0]->data points to the entire btree node as it exists on disk.
	 */
	struct bset_tree sets[MAX_BSETS];

	/* Used to refcount bio splits, also protects b->bio */
	struct closure_with_waitlist io;

	/* Gets transferred to w->prio_blocked - see the comment there */
	int prio_blocked;

	struct list_head list;
	struct delayed_work work;

	uint64_t io_start_time;
	/* One write in flight, one being set up; BTREE_NODE_write_idx picks */
	struct btree_write writes[2];
	struct bio *bio;
};
158 | |||
/*
 * BTREE_FLAG(foo) generates btree_node_foo() / set_btree_node_foo()
 * accessors for the BTREE_NODE_foo bit in b->flags.
 */
#define BTREE_FLAG(flag)						\
static inline bool btree_node_ ## flag(struct btree *b)			\
{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
									\
static inline void set_btree_node_ ## flag(struct btree *b)		\
{	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\

enum btree_flags {
	BTREE_NODE_read_done,	/* node contents have been read in */
	BTREE_NODE_io_error,
	BTREE_NODE_dirty,	/* has keys that still need writing out */
	BTREE_NODE_write_idx,	/* which of writes[] is current */
};

BTREE_FLAG(read_done);
BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
177 | |||
178 | static inline struct btree_write *btree_current_write(struct btree *b) | ||
179 | { | ||
180 | return b->writes + btree_node_write_idx(b); | ||
181 | } | ||
182 | |||
183 | static inline struct btree_write *btree_prev_write(struct btree *b) | ||
184 | { | ||
185 | return b->writes + (btree_node_write_idx(b) ^ 1); | ||
186 | } | ||
187 | |||
188 | static inline unsigned bset_offset(struct btree *b, struct bset *i) | ||
189 | { | ||
190 | return (((size_t) i) - ((size_t) b->sets->data)) >> 9; | ||
191 | } | ||
192 | |||
193 | static inline struct bset *write_block(struct btree *b) | ||
194 | { | ||
195 | return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); | ||
196 | } | ||
197 | |||
198 | static inline bool bset_written(struct btree *b, struct bset_tree *t) | ||
199 | { | ||
200 | return t->data < write_block(b); | ||
201 | } | ||
202 | |||
203 | static inline bool bkey_written(struct btree *b, struct bkey *k) | ||
204 | { | ||
205 | return k < write_block(b)->start; | ||
206 | } | ||
207 | |||
/* Reset sectors_to_gc to 1/8 of the cache set's total sectors
 * (bucket_size * nbuckets) - presumably the write volume between GC
 * passes; confirm against the consumer of sectors_to_gc */
static inline void set_gc_sectors(struct cache_set *c)
{
	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
}
212 | |||
/* Wrapper around __bch_ptr_invalid(), supplying this node's cache set
 * and level */
static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
{
	return __bch_ptr_invalid(b->c, b->level, k);
}

/* Start an iterator over all of @b's bsets, positioned at @search */
static inline struct bkey *bch_btree_iter_init(struct btree *b,
					       struct btree_iter *iter,
					       struct bkey *search)
{
	return __bch_btree_iter_init(b, iter, search, b->sets);
}
224 | |||
/* Looping macros */

/*
 * Iterate over every btree node in the cache's hash table; @iter indexes
 * bucket_hash, @b is bound to each node (RCU hlist traversal).
 */
#define for_each_cached_btree(b, c, iter)				\
	for (iter = 0;							\
	     iter < ARRAY_SIZE((c)->bucket_hash);			\
	     iter++)							\
		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)

/* Iterate @b's keys via bch_btree_iter_next_filter(), applying @filter */
#define for_each_key_filter(b, k, iter, filter)				\
	for (bch_btree_iter_init((b), (iter), NULL);			\
	     ((k) = bch_btree_iter_next_filter((iter), b, filter));)

/* Iterate over all of @b's keys */
#define for_each_key(b, k, iter)					\
	for (bch_btree_iter_init((b), (iter), NULL);			\
	     ((k) = bch_btree_iter_next(iter));)
240 | |||
241 | /* Recursing down the btree */ | ||
242 | |||
struct btree_op {
	/* Embedded closure - used for waiting on IO or memory reserves
	 * (see the file comment above) */
	struct closure cl;
	struct cache_set *c;

	/* Journal entry we have a refcount on */
	atomic_t *journal;

	/* Bio to be inserted into the cache */
	struct bio *cache_bio;

	/* NOTE(review): presumably the inode the keys being inserted belong
	 * to - confirm against callers */
	unsigned inode;

	uint16_t write_prio;

	/* Btree level at which we start taking write locks */
	short lock;

	/* Btree insertion type: BTREE_REPLACE compares against the
	 * "replace" key below - NOTE(review): confirm in btree.c */
	enum {
		BTREE_INSERT,
		BTREE_REPLACE
	} type:8;

	unsigned csum:1;
	unsigned skip:1;
	unsigned flush_journal:1;

	/* Set as the corresponding stages of the operation complete */
	unsigned insert_data_done:1;
	unsigned lookup_done:1;
	unsigned insert_collision:1;

	/* Anything after this point won't get zeroed in do_bio_hook() */

	/* Keys to be inserted */
	struct keylist keys;
	BKEY_PADDED(replace);
};
280 | |||
281 | void bch_btree_op_init_stack(struct btree_op *); | ||
282 | |||
283 | static inline void rw_lock(bool w, struct btree *b, int level) | ||
284 | { | ||
285 | w ? down_write_nested(&b->lock, level + 1) | ||
286 | : down_read_nested(&b->lock, level + 1); | ||
287 | if (w) | ||
288 | b->seq++; | ||
289 | } | ||
290 | |||
/* Release b->lock; on write unlock, bump b->seq (so readers that dropped
 * and retook the lock can detect the write - see the LOCKING comment
 * above) and, in EDEBUG builds, verify key ordering in every bset. */
static inline void rw_unlock(bool w, struct btree *b)
{
#ifdef CONFIG_BCACHE_EDEBUG
	unsigned i;

	/* Only check nodes that were actually read in; the ptr[0] test
	 * presumably skips nodes without a backing pointer - confirm */
	if (w &&
	    b->key.ptr[0] &&
	    btree_node_read_done(b))
		for (i = 0; i <= b->nsets; i++)
			bch_check_key_order(b, b->sets[i].data);
#endif

	if (w)
		b->seq++;
	(w ? up_write : up_read)(&b->lock);
}
307 | |||
/* Should node @b be write locked, given the op's starting level s->lock? */
#define insert_lock(s, b)	((b)->level <= (s)->lock)
309 | |||
310 | /* | ||
311 | * These macros are for recursing down the btree - they handle the details of | ||
312 | * locking and looking up nodes in the cache for you. They're best treated as | ||
313 | * mere syntax when reading code that uses them. | ||
314 | * | ||
315 | * op->lock determines whether we take a read or a write lock at a given depth. | ||
316 | * If you've got a read lock and find that you need a write lock (i.e. you're | ||
317 | * going to have to split), set op->lock and return -EINTR; btree_root() will | ||
318 | * call you again and you'll have the correct lock. | ||
319 | */ | ||
320 | |||
/**
 * btree - recurse down the btree on a specified key
 * @fn: function to call, which will be passed the child node
 * @key: key to recurse on
 * @b: parent btree node
 * @op: pointer to struct btree_op
 *
 * Looks up the child node @key points to via bch_btree_node_get() (read
 * or write locked per op->lock), invokes bch_btree_<fn>() on it, and
 * unlocks it. Evaluates to the callee's return value, or PTR_ERR() if the
 * node lookup failed.
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
	struct btree *_b = bch_btree_node_get((b)->c, key, l, op);	\
	if (!IS_ERR(_b)) {						\
		_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);		\
		rw_unlock(_w, _b);					\
	} else								\
		_r = PTR_ERR(_b);					\
	_r;								\
})
340 | |||
/**
 * btree_root - call a function on the root of the btree
 * @fn: function to call, which will be passed the child node
 * @c: cache set
 * @op: pointer to struct btree_op
 *
 * Locks the root, re-checks after acquiring that it is still the root and
 * that the lock type taken is still correct (the root can change while we
 * sleep on the lock), calls bch_btree_<fn>() on it, and retries the whole
 * sequence while the callee returns -EINTR (lock-upgrade restart - see
 * the LOCKING comment above). The btree cache "cannibalize" lock is
 * dropped on every pass via bch_cannibalize_unlock().
 */
#define btree_root(fn, c, op, ...)					\
({									\
	int _r = -EINTR;						\
	do {								\
		struct btree *_b = (c)->root;				\
		bool _w = insert_lock(op, _b);				\
		rw_lock(_w, _b, _b->level);				\
		if (_b == (c)->root &&					\
		    _w == insert_lock(op, _b))				\
			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _b);					\
		bch_cannibalize_unlock(c, &(op)->cl);			\
	} while (_r == -EINTR);						\
									\
	_r;								\
})
363 | |||
/*
 * True if an insert could require splitting this node: either it is
 * physically full, or appending to the current unwritten bset (same seq
 * as the node, i.e. still part of this node's log) could push past the
 * end of the node. The "+ 15" leaves headroom for one more key -
 * NOTE(review): presumably the max key size in u64s; confirm.
 */
static inline bool should_split(struct btree *b)
{
	struct bset *i = write_block(b);
	return b->written >= btree_blocks(b) ||
	       (i->seq == b->sets[0].data->seq &&
		b->written + __set_blocks(i, i->keys + 15, b->c)
		> btree_blocks(b));
}
372 | |||
373 | void bch_btree_read_done(struct closure *); | ||
374 | void bch_btree_read(struct btree *); | ||
375 | void bch_btree_write(struct btree *b, bool now, struct btree_op *op); | ||
376 | |||
377 | void bch_cannibalize_unlock(struct cache_set *, struct closure *); | ||
378 | void bch_btree_set_root(struct btree *); | ||
379 | struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); | ||
380 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, | ||
381 | int, struct btree_op *); | ||
382 | |||
383 | bool bch_btree_insert_keys(struct btree *, struct btree_op *); | ||
384 | bool bch_btree_insert_check_key(struct btree *, struct btree_op *, | ||
385 | struct bio *); | ||
386 | int bch_btree_insert(struct btree_op *, struct cache_set *); | ||
387 | |||
388 | int bch_btree_search_recurse(struct btree *, struct btree_op *); | ||
389 | |||
390 | void bch_queue_gc(struct cache_set *); | ||
391 | size_t bch_btree_gc_finish(struct cache_set *); | ||
392 | void bch_moving_gc(struct closure *); | ||
393 | int bch_btree_check(struct cache_set *, struct btree_op *); | ||
394 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); | ||
395 | |||
396 | void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); | ||
397 | void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); | ||
398 | bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, | ||
399 | struct bkey *); | ||
400 | void bch_keybuf_del(struct keybuf *, struct keybuf_key *); | ||
401 | struct keybuf_key *bch_keybuf_next(struct keybuf *); | ||
402 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, | ||
403 | struct keybuf *, struct bkey *); | ||
404 | |||
405 | #endif | ||
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c new file mode 100644 index 000000000000..bd05a9a8c7cf --- /dev/null +++ b/drivers/md/bcache/closure.c | |||
@@ -0,0 +1,345 @@ | |||
1 | /* | ||
2 | * Asynchronous refcounty things | ||
3 | * | ||
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include <linux/debugfs.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/seq_file.h> | ||
11 | |||
12 | #include "closure.h" | ||
13 | |||
14 | void closure_queue(struct closure *cl) | ||
15 | { | ||
16 | struct workqueue_struct *wq = cl->wq; | ||
17 | if (wq) { | ||
18 | INIT_WORK(&cl->work, cl->work.func); | ||
19 | BUG_ON(!queue_work(wq, &cl->work)); | ||
20 | } else | ||
21 | cl->fn(cl); | ||
22 | } | ||
23 | EXPORT_SYMBOL_GPL(closure_queue); | ||
24 | |||
/*
 * Expands to a switch case returning the named field of the closure's
 * containing type (which embeds the base closure as ->cl), keyed on
 * cl->type.
 */
#define CL_FIELD(type, field) \
	case TYPE_ ## type: \
	return &container_of(cl, struct type, cl)->field

/* The closure's embedded waitlist, or NULL if its type has none */
static struct closure_waitlist *closure_waitlist(struct closure *cl)
{
	switch (cl->type) {
		CL_FIELD(closure_with_waitlist, wait);
		CL_FIELD(closure_with_waitlist_and_timer, wait);
	default:
		return NULL;
	}
}

/* The closure's embedded timer, or NULL if its type has none */
static struct timer_list *closure_timer(struct closure *cl)
{
	switch (cl->type) {
		CL_FIELD(closure_with_timer, timer);
		CL_FIELD(closure_with_waitlist_and_timer, timer);
	default:
		return NULL;
	}
}
48 | |||
/*
 * Common tail of closure_put()/closure_sub(): @flags is the value of
 * cl->remaining after the atomic decrement. When the count hits zero,
 * either requeue the continuation or finish the closure entirely.
 */
static inline void closure_put_after_sub(struct closure *cl, int flags)
{
	int r = flags & CLOSURE_REMAINING_MASK;

	/* Spilling into the guard bits means a refcount over/underflow */
	BUG_ON(flags & CLOSURE_GUARD_MASK);
	/* At zero only DESTRUCTOR/BLOCKING may still be set */
	BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING)));

	/* Must deliver precisely one wakeup */
	if (r == 1 && (flags & CLOSURE_SLEEPING))
		wake_up_process(cl->task);

	if (!r) {
		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
			/* CLOSURE_BLOCKING might be set - clear it */
			atomic_set(&cl->remaining,
				   CLOSURE_REMAINING_INITIALIZER);
			closure_queue(cl);
		} else {
			/* Final teardown: run destructor (if any), wake
			 * waiters, drop our ref on the parent */
			struct closure *parent = cl->parent;
			struct closure_waitlist *wait = closure_waitlist(cl);

			closure_debug_destroy(cl);

			/* -1 marks the closure unlocked/dead - this is what
			 * closure_trylock() cmpxchg's against */
			atomic_set(&cl->remaining, -1);

			if (wait)
				closure_wake_up(wait);

			if (cl->fn)
				cl->fn(cl);

			if (parent)
				closure_put(parent);
		}
	}
}
85 | |||
/* For clearing flags with the same atomic op as a put */
void closure_sub(struct closure *cl, int v)
{
	closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
}
EXPORT_SYMBOL_GPL(closure_sub);

/* Drop one reference; the final put runs the continuation or destructor
 * (see closure_put_after_sub()) */
void closure_put(struct closure *cl)
{
	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
}
EXPORT_SYMBOL_GPL(closure_put);
98 | |||
/* Record what a closure is blocked on, for the debugfs dump */
static void set_waiting(struct closure *cl, unsigned long f)
{
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
	cl->waiting_on = f;
#endif
}

/*
 * Wake every closure on @wait_list: for each, clear CLOSURE_WAITING and
 * drop the ref taken in closure_wait().
 */
void __closure_wake_up(struct closure_waitlist *wait_list)
{
	struct llist_node *list;
	struct closure *cl;
	struct llist_node *reverse = NULL;

	/* Atomically take ownership of the whole list */
	list = llist_del_all(&wait_list->list);

	/* We first reverse the list to preserve FIFO ordering and fairness */

	while (list) {
		struct llist_node *t = list;
		list = llist_next(list);

		t->next = reverse;
		reverse = t;
	}

	/* Then do the wakeups */

	while (reverse) {
		cl = container_of(reverse, struct closure, list);
		reverse = llist_next(reverse);

		set_waiting(cl, 0);
		closure_sub(cl, CLOSURE_WAITING + 1);
	}
}
EXPORT_SYMBOL_GPL(__closure_wake_up);
135 | |||
/*
 * Add @cl to @list, setting CLOSURE_WAITING and taking a ref that
 * __closure_wake_up() will drop. Returns false (without queueing) if the
 * closure is already waiting on something.
 */
bool closure_wait(struct closure_waitlist *list, struct closure *cl)
{
	if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
		return false;

	/* Record the caller for debugfs (no-op without CLOSURES_DEBUG) */
	set_waiting(cl, _RET_IP_);
	atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
	llist_add(&cl->list, &list->list);

	return true;
}
EXPORT_SYMBOL_GPL(closure_wait);
148 | |||
/**
 * closure_sync() - sleep until a closure has nothing left to wait on
 *
 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
 * the last refcount.
 */
void closure_sync(struct closure *cl)
{
	while (1) {
		/* Mark ourselves sleeping before checking the count, so
		 * closure_put_after_sub() delivers the wakeup when the count
		 * reaches 1 (it checks CLOSURE_SLEEPING) */
		__closure_start_sleep(cl);
		closure_set_ret_ip(cl);

		if ((atomic_read(&cl->remaining) &
		     CLOSURE_REMAINING_MASK) == 1)
			break;

		schedule();
	}

	__closure_end_sleep(cl);
}
EXPORT_SYMBOL_GPL(closure_sync);
171 | |||
/**
 * closure_trylock() - try to acquire the closure, without waiting
 * @cl: closure to lock
 *
 * Returns true if the closure was successfully locked.
 *
 * A closure is unlocked when cl->remaining == -1 (set by
 * closure_put_after_sub() at teardown); locking transitions it to the
 * initial refcount with a single cmpxchg.
 */
bool closure_trylock(struct closure *cl, struct closure *parent)
{
	if (atomic_cmpxchg(&cl->remaining, -1,
			   CLOSURE_REMAINING_INITIALIZER) != -1)
		return false;

	closure_set_ret_ip(cl);

	/* Order the cmpxchg before publishing parent/debug state */
	smp_mb();
	cl->parent = parent;
	if (parent)
		closure_get(parent);

	closure_debug_create(cl);
	return true;
}
EXPORT_SYMBOL_GPL(closure_trylock);
195 | |||
/*
 * Acquire @cl, sleeping on @wait_list until the current holder releases it
 * (i.e. until cl->remaining drops back to -1 and closure_trylock()
 * succeeds).
 */
void __closure_lock(struct closure *cl, struct closure *parent,
		    struct closure_waitlist *wait_list)
{
	struct closure wait;
	closure_init_stack(&wait);

	while (1) {
		if (closure_trylock(cl, parent))
			return;

		closure_wait_event_sync(wait_list, &wait,
					atomic_read(&cl->remaining) == -1);
	}
}
EXPORT_SYMBOL_GPL(__closure_lock);
211 | |||
/* Timer callback: drop the CLOSURE_TIMER flag and the ref taken by
 * __closure_delay() */
static void closure_delay_timer_fn(unsigned long data)
{
	struct closure *cl = (struct closure *) data;
	closure_sub(cl, CLOSURE_TIMER + 1);
}

/* Set up the embedded timer for a closure type that has one */
void do_closure_timer_init(struct closure *cl)
{
	struct timer_list *timer = closure_timer(cl);

	init_timer(timer);
	timer->data = (unsigned long) cl;
	timer->function = closure_delay_timer_fn;
}
EXPORT_SYMBOL_GPL(do_closure_timer_init);
227 | |||
/*
 * Arm @timer to fire in @delay jiffies, setting CLOSURE_TIMER and taking a
 * ref that closure_delay_timer_fn() (or one of the flush functions) drops.
 * Returns false if a delay is already pending on this closure.
 */
bool __closure_delay(struct closure *cl, unsigned long delay,
		     struct timer_list *timer)
{
	if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
		return false;

	BUG_ON(timer_pending(timer));

	timer->expires = jiffies + delay;

	atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
	add_timer(timer);
	return true;
}
EXPORT_SYMBOL_GPL(__closure_delay);
243 | |||
/* Cancel a pending delay; if the timer hadn't fired yet, drop its flag
 * and ref (the callback would have done so otherwise) */
void __closure_flush(struct closure *cl, struct timer_list *timer)
{
	if (del_timer(timer))
		closure_sub(cl, CLOSURE_TIMER + 1);
}
EXPORT_SYMBOL_GPL(__closure_flush);

/* As __closure_flush(), but also waits for a concurrently running timer
 * callback to finish (del_timer_sync) */
void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
{
	if (del_timer_sync(timer))
		closure_sub(cl, CLOSURE_TIMER + 1);
}
EXPORT_SYMBOL_GPL(__closure_flush_sync);
257 | |||
258 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||
259 | |||
260 | static LIST_HEAD(closure_list); | ||
261 | static DEFINE_SPINLOCK(closure_list_lock); | ||
262 | |||
/* Register a newly locked closure on the global debug list; the magic
 * check catches double-init or reuse of a live closure */
void closure_debug_create(struct closure *cl)
{
	unsigned long flags;

	BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
	cl->magic = CLOSURE_MAGIC_ALIVE;

	spin_lock_irqsave(&closure_list_lock, flags);
	list_add(&cl->all, &closure_list);
	spin_unlock_irqrestore(&closure_list_lock, flags);
}
EXPORT_SYMBOL_GPL(closure_debug_create);

/* Remove a closure from the debug list; the magic check catches
 * double-destroy or destroying a never-created closure */
void closure_debug_destroy(struct closure *cl)
{
	unsigned long flags;

	BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
	cl->magic = CLOSURE_MAGIC_DEAD;

	spin_lock_irqsave(&closure_list_lock, flags);
	list_del(&cl->all);
	spin_unlock_irqrestore(&closure_list_lock, flags);
}
EXPORT_SYMBOL_GPL(closure_debug_destroy);
288 | |||
static struct dentry *debug;

/* Peek at work_struct's atomic data word so we can test WORK_STRUCT_PENDING */
#define work_data_bits(work) ((unsigned long *)(&(work)->data))

/* debugfs: dump every live closure - address, ip, fn, parent, refcount,
 * and flag letters (Q queued, R running, B blocking, S stack, Sl sleeping,
 * T timer; W plus the wait site if waiting) */
static int debug_seq_show(struct seq_file *f, void *data)
{
	struct closure *cl;
	spin_lock_irq(&closure_list_lock);

	list_for_each_entry(cl, &closure_list, all) {
		int r = atomic_read(&cl->remaining);

		seq_printf(f, "%p: %pF -> %pf p %p r %i ",
			   cl, (void *) cl->ip, cl->fn, cl->parent,
			   r & CLOSURE_REMAINING_MASK);

		seq_printf(f, "%s%s%s%s%s%s\n",
			   test_bit(WORK_STRUCT_PENDING,
				    work_data_bits(&cl->work)) ? "Q" : "",
			   r & CLOSURE_RUNNING ? "R" : "",
			   r & CLOSURE_BLOCKING ? "B" : "",
			   r & CLOSURE_STACK ? "S" : "",
			   r & CLOSURE_SLEEPING ? "Sl" : "",
			   r & CLOSURE_TIMER ? "T" : "");

		if (r & CLOSURE_WAITING)
			/* NOTE(review): printed after the '\n' above, so the
			 * W info lands on its own line with a leading space */
			seq_printf(f, " W %pF\n",
				   (void *) cl->waiting_on);

		seq_printf(f, "\n");
	}

	spin_unlock_irq(&closure_list_lock);
	return 0;
}
324 | |||
static int debug_seq_open(struct inode *inode, struct file *file)
{
	/* single_open(): the whole dump comes from one debug_seq_show() call */
	return single_open(file, debug_seq_show, NULL);
}

static const struct file_operations debug_ops = {
	.owner = THIS_MODULE,
	.open = debug_seq_open,
	.read = seq_read,
	.release = single_release
};

/* Create the read-only "closures" file in the debugfs root */
void __init closure_debug_init(void)
{
	debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
}
341 | |||
342 | #endif | ||
343 | |||
344 | MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>"); | ||
345 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h new file mode 100644 index 000000000000..00039924ea9d --- /dev/null +++ b/drivers/md/bcache/closure.h | |||
@@ -0,0 +1,672 @@ | |||
1 | #ifndef _LINUX_CLOSURE_H | ||
2 | #define _LINUX_CLOSURE_H | ||
3 | |||
4 | #include <linux/llist.h> | ||
5 | #include <linux/sched.h> | ||
6 | #include <linux/workqueue.h> | ||
7 | |||
8 | /* | ||
9 | * Closure is perhaps the most overused and abused term in computer science, but | ||
10 | * since I've been unable to come up with anything better you're stuck with it | ||
11 | * again. | ||
12 | * | ||
13 | * What are closures? | ||
14 | * | ||
15 | * They embed a refcount. The basic idea is they count "things that are in | ||
16 | * progress" - in flight bios, some other thread that's doing something else - | ||
17 | * anything you might want to wait on. | ||
18 | * | ||
19 | * The refcount may be manipulated with closure_get() and closure_put(). | ||
20 | * closure_put() is where many of the interesting things happen, when it causes | ||
21 | * the refcount to go to 0. | ||
22 | * | ||
23 | * Closures can be used to wait on things both synchronously and asynchronously, | ||
24 | * and synchronous and asynchronous use can be mixed without restriction. To | ||
25 | * wait synchronously, use closure_sync() - you will sleep until your closure's | ||
26 | * refcount hits 1. | ||
27 | * | ||
28 | * To wait asynchronously, use | ||
29 | * continue_at(cl, next_function, workqueue); | ||
30 | * | ||
31 | * passing it, as you might expect, the function to run when nothing is pending | ||
32 | * and the workqueue to run that function out of. | ||
33 | * | ||
34 | * continue_at() also, critically, is a macro that returns the calling function. | ||
35 | * There's good reason for this. | ||
36 | * | ||
37 | * To safely use closures asynchronously, they must always have a refcount while | ||
38 | * they are running owned by the thread that is running them. Otherwise, suppose | ||
39 | * you submit some bios and wish to have a function run when they all complete: | ||
40 | * | ||
41 | * foo_endio(struct bio *bio, int error) | ||
42 | * { | ||
43 | * closure_put(cl); | ||
44 | * } | ||
45 | * | ||
46 | * closure_init(cl); | ||
47 | * | ||
48 | * do_stuff(); | ||
49 | * closure_get(cl); | ||
50 | * bio1->bi_endio = foo_endio; | ||
51 | * bio_submit(bio1); | ||
52 | * | ||
53 | * do_more_stuff(); | ||
54 | * closure_get(cl); | ||
55 | * bio2->bi_endio = foo_endio; | ||
56 | * bio_submit(bio2); | ||
57 | * | ||
58 | * continue_at(cl, complete_some_read, system_wq); | ||
59 | * | ||
60 | * If closure's refcount started at 0, complete_some_read() could run before the | ||
61 | * second bio was submitted - which is almost always not what you want! More | ||
62 | * importantly, it wouldn't be possible to say whether the original thread or | ||
63 | * complete_some_read()'s thread owned the closure - and whatever state it was | ||
64 | * associated with! | ||
65 | * | ||
66 | * So, closure_init() initializes a closure's refcount to 1 - and when a | ||
67 | * closure_fn is run, the refcount will be reset to 1 first. | ||
68 | * | ||
69 | * Then, the rule is - if you got the refcount with closure_get(), release it | ||
70 | * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount | ||
71 | * on a closure because you called closure_init() or you were run out of a | ||
72 | * closure - _always_ use continue_at(). Doing so consistently will help | ||
73 | * eliminate an entire class of particularly pernicious races. | ||
74 | * | ||
75 | * For a closure to wait on an arbitrary event, we need to introduce waitlists: | ||
76 | * | ||
77 | * struct closure_waitlist list; | ||
78 | * closure_wait_event(list, cl, condition); | ||
79 | * closure_wake_up(wait_list); | ||
80 | * | ||
81 | * These work analogously to wait_event() and wake_up() - except that instead of | ||
82 | * operating on the current thread (for wait_event()) and lists of threads, they | ||
83 | * operate on an explicit closure and lists of closures. | ||
84 | * | ||
85 | * Because it's a closure we can now wait either synchronously or | ||
86 | * asynchronously. closure_wait_event() returns the current value of the | ||
87 | * condition, and if it returned false continue_at() or closure_sync() can be | ||
88 | * used to wait for it to become true. | ||
89 | * | ||
90 | * It's useful for waiting on things when you can't sleep in the context in | ||
91 | * which you must check the condition (perhaps a spinlock held, or you might be | ||
92 | * beneath generic_make_request() - in which case you can't sleep on IO). | ||
93 | * | ||
94 | * closure_wait_event() will wait either synchronously or asynchronously, | ||
95 | * depending on whether the closure is in blocking mode or not. You can pick a | ||
96 | * mode explicitly with closure_wait_event_sync() and | ||
97 | * closure_wait_event_async(), which do just what you might expect. | ||
98 | * | ||
99 | * Lastly, you might have a wait list dedicated to a specific event, and have no | ||
100 | * need for specifying the condition - you just want to wait until someone runs | ||
101 | * closure_wake_up() on the appropriate wait list. In that case, just use | ||
102 | * closure_wait(). It will return either true or false, depending on whether the | ||
103 | * closure was already on a wait list or not - a closure can only be on one wait | ||
104 | * list at a time. | ||
105 | * | ||
106 | * Parents: | ||
107 | * | ||
108 | * closure_init() takes two arguments - it takes the closure to initialize, and | ||
109 | * a (possibly null) parent. | ||
110 | * | ||
111 | * If parent is non null, the new closure will have a refcount for its lifetime; | ||
112 | * a closure is considered to be "finished" when its refcount hits 0 and the | ||
113 | * function to run is null. Hence | ||
114 | * | ||
115 | * continue_at(cl, NULL, NULL); | ||
116 | * | ||
117 | * returns up the (spaghetti) stack of closures, precisely like normal return | ||
118 | * returns up the C stack. continue_at() with non null fn is better thought of | ||
119 | * as doing a tail call. | ||
120 | * | ||
121 | * All this implies that a closure should typically be embedded in a particular | ||
122 | * struct (which its refcount will normally control the lifetime of), and that | ||
123 | * struct can very much be thought of as a stack frame. | ||
124 | * | ||
125 | * Locking: | ||
126 | * | ||
127 | * Closures are based on work items but they can be thought of as more like | ||
128 | * threads - in that like threads and unlike work items they have a well | ||
129 | * defined lifetime; they are created (with closure_init()) and eventually | ||
130 | * complete after a continue_at(cl, NULL, NULL). | ||
131 | * | ||
132 | * Suppose you've got some larger structure with a closure embedded in it that's | ||
133 | * used for periodically doing garbage collection. You only want one garbage | ||
134 | * collection happening at a time, so the natural thing to do is protect it with | ||
135 | * a lock. However, it's difficult to use a lock protecting a closure correctly | ||
136 | * because the unlock should come after the last continue_at() (additionally, if | ||
137 | * you're using the closure asynchronously a mutex won't work since a mutex has | ||
138 | * to be unlocked by the same process that locked it). | ||
139 | * | ||
140 | * So to make it less error prone and more efficient, we also have the ability | ||
141 | * to use closures as locks: | ||
142 | * | ||
143 | * closure_init_unlocked(); | ||
144 | * closure_trylock(); | ||
145 | * | ||
146 | * That's all we need for trylock() - the last closure_put() implicitly unlocks | ||
147 | * it for you. But for closure_lock(), we also need a wait list: | ||
148 | * | ||
149 | * struct closure_with_waitlist frobnicator_cl; | ||
150 | * | ||
151 | * closure_init_unlocked(&frobnicator_cl); | ||
152 | * closure_lock(&frobnicator_cl); | ||
153 | * | ||
154 | * A closure_with_waitlist embeds a closure and a wait list - much like struct | ||
155 | * delayed_work embeds a work item and a timer_list. The important thing is, use | ||
156 | * it exactly like you would a regular closure and closure_put() will magically | ||
157 | * handle everything for you. | ||
158 | * | ||
159 | * We've got closures that embed timers, too. They're called, appropriately | ||
160 | * enough: | ||
161 | * struct closure_with_timer; | ||
162 | * | ||
163 | * This gives you access to closure_delay(). It takes a refcount for a specified | ||
164 | * number of jiffies - you could then call closure_sync() (for a slightly | ||
165 | * convoluted version of msleep()) or continue_at() - which gives you the same | ||
166 | * effect as using a delayed work item, except you can reuse the work_struct | ||
167 | * already embedded in struct closure. | ||
168 | * | ||
169 | * Lastly, there's struct closure_with_waitlist_and_timer. It does what you | ||
170 | * probably expect, if you happen to need the features of both. (You don't | ||
171 | * really want to know how all this is implemented, but if I've done my job | ||
172 | * right you shouldn't have to care). | ||
173 | */ | ||
174 | |||
175 | struct closure; | ||
176 | typedef void (closure_fn) (struct closure *); | ||
177 | |||
178 | struct closure_waitlist { | ||
179 | struct llist_head list; | ||
180 | }; | ||
181 | |||
182 | enum closure_type { | ||
183 | TYPE_closure = 0, | ||
184 | TYPE_closure_with_waitlist = 1, | ||
185 | TYPE_closure_with_timer = 2, | ||
186 | TYPE_closure_with_waitlist_and_timer = 3, | ||
187 | MAX_CLOSURE_TYPE = 3, | ||
188 | }; | ||
189 | |||
190 | enum closure_state { | ||
191 | /* | ||
192 | * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of | ||
193 | * waiting asynchronously | ||
194 | * | ||
195 | * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by | ||
196 | * the thread that owns the closure, and cleared by the thread that's | ||
197 | * waking up the closure. | ||
198 | * | ||
199 | * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep | ||
200 | * - indicates that cl->task is valid and closure_put() may wake it up. | ||
201 | * Only set or cleared by the thread that owns the closure. | ||
202 | * | ||
203 | * CLOSURE_TIMER: Analagous to CLOSURE_WAITING, indicates that a closure | ||
204 | * has an outstanding timer. Must be set by the thread that owns the | ||
205 | * closure, and cleared by the timer function when the timer goes off. | ||
206 | * | ||
207 | * The rest are for debugging and don't affect behaviour: | ||
208 | * | ||
209 | * CLOSURE_RUNNING: Set when a closure is running (i.e. by | ||
210 | * closure_init() and when closure_put() runs then next function), and | ||
211 | * must be cleared before remaining hits 0. Primarily to help guard | ||
212 | * against incorrect usage and accidentally transferring references. | ||
213 | * continue_at() and closure_return() clear it for you, if you're doing | ||
214 | * something unusual you can use closure_set_dead() which also helps | ||
215 | * annotate where references are being transferred. | ||
216 | * | ||
217 | * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a | ||
218 | * closure with this flag set | ||
219 | */ | ||
220 | |||
221 | CLOSURE_BITS_START = (1 << 19), | ||
222 | CLOSURE_DESTRUCTOR = (1 << 19), | ||
223 | CLOSURE_BLOCKING = (1 << 21), | ||
224 | CLOSURE_WAITING = (1 << 23), | ||
225 | CLOSURE_SLEEPING = (1 << 25), | ||
226 | CLOSURE_TIMER = (1 << 27), | ||
227 | CLOSURE_RUNNING = (1 << 29), | ||
228 | CLOSURE_STACK = (1 << 31), | ||
229 | }; | ||
230 | |||
231 | #define CLOSURE_GUARD_MASK \ | ||
232 | ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ | ||
233 | CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) | ||
234 | |||
235 | #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) | ||
236 | #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) | ||
237 | |||
238 | struct closure { | ||
239 | union { | ||
240 | struct { | ||
241 | struct workqueue_struct *wq; | ||
242 | struct task_struct *task; | ||
243 | struct llist_node list; | ||
244 | closure_fn *fn; | ||
245 | }; | ||
246 | struct work_struct work; | ||
247 | }; | ||
248 | |||
249 | struct closure *parent; | ||
250 | |||
251 | atomic_t remaining; | ||
252 | |||
253 | enum closure_type type; | ||
254 | |||
255 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||
256 | #define CLOSURE_MAGIC_DEAD 0xc054dead | ||
257 | #define CLOSURE_MAGIC_ALIVE 0xc054a11e | ||
258 | |||
259 | unsigned magic; | ||
260 | struct list_head all; | ||
261 | unsigned long ip; | ||
262 | unsigned long waiting_on; | ||
263 | #endif | ||
264 | }; | ||
265 | |||
266 | struct closure_with_waitlist { | ||
267 | struct closure cl; | ||
268 | struct closure_waitlist wait; | ||
269 | }; | ||
270 | |||
271 | struct closure_with_timer { | ||
272 | struct closure cl; | ||
273 | struct timer_list timer; | ||
274 | }; | ||
275 | |||
276 | struct closure_with_waitlist_and_timer { | ||
277 | struct closure cl; | ||
278 | struct closure_waitlist wait; | ||
279 | struct timer_list timer; | ||
280 | }; | ||
281 | |||
/* Never defined: referencing it is a link-time error for unknown types. */
extern unsigned invalid_closure_type(void);

/*
 * One arm of a compile-time type dispatch: expands to the matching
 * TYPE_* constant followed by a dangling ':' that the next arm (or the
 * final invalid_closure_type()) completes.
 */
#define __CLOSURE_TYPE(cl, _t)						\
	  __builtin_types_compatible_p(typeof(cl), struct _t)		\
		? TYPE_ ## _t :						\

/* Map an object's static type to its enum closure_type at compile time. */
#define __closure_type(cl)						\
(									\
	__CLOSURE_TYPE(cl, closure)					\
	__CLOSURE_TYPE(cl, closure_with_waitlist)			\
	__CLOSURE_TYPE(cl, closure_with_timer)				\
	__CLOSURE_TYPE(cl, closure_with_waitlist_and_timer)		\
	invalid_closure_type()						\
)
296 | |||
297 | void closure_sub(struct closure *cl, int v); | ||
298 | void closure_put(struct closure *cl); | ||
299 | void closure_queue(struct closure *cl); | ||
300 | void __closure_wake_up(struct closure_waitlist *list); | ||
301 | bool closure_wait(struct closure_waitlist *list, struct closure *cl); | ||
302 | void closure_sync(struct closure *cl); | ||
303 | |||
304 | bool closure_trylock(struct closure *cl, struct closure *parent); | ||
305 | void __closure_lock(struct closure *cl, struct closure *parent, | ||
306 | struct closure_waitlist *wait_list); | ||
307 | |||
308 | void do_closure_timer_init(struct closure *cl); | ||
309 | bool __closure_delay(struct closure *cl, unsigned long delay, | ||
310 | struct timer_list *timer); | ||
311 | void __closure_flush(struct closure *cl, struct timer_list *timer); | ||
312 | void __closure_flush_sync(struct closure *cl, struct timer_list *timer); | ||
313 | |||
314 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||
315 | |||
316 | void closure_debug_init(void); | ||
317 | void closure_debug_create(struct closure *cl); | ||
318 | void closure_debug_destroy(struct closure *cl); | ||
319 | |||
320 | #else | ||
321 | |||
322 | static inline void closure_debug_init(void) {} | ||
323 | static inline void closure_debug_create(struct closure *cl) {} | ||
324 | static inline void closure_debug_destroy(struct closure *cl) {} | ||
325 | |||
326 | #endif | ||
327 | |||
328 | static inline void closure_set_ip(struct closure *cl) | ||
329 | { | ||
330 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||
331 | cl->ip = _THIS_IP_; | ||
332 | #endif | ||
333 | } | ||
334 | |||
335 | static inline void closure_set_ret_ip(struct closure *cl) | ||
336 | { | ||
337 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||
338 | cl->ip = _RET_IP_; | ||
339 | #endif | ||
340 | } | ||
341 | |||
342 | static inline void closure_get(struct closure *cl) | ||
343 | { | ||
344 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | ||
345 | BUG_ON((atomic_inc_return(&cl->remaining) & | ||
346 | CLOSURE_REMAINING_MASK) <= 1); | ||
347 | #else | ||
348 | atomic_inc(&cl->remaining); | ||
349 | #endif | ||
350 | } | ||
351 | |||
352 | static inline void closure_set_stopped(struct closure *cl) | ||
353 | { | ||
354 | atomic_sub(CLOSURE_RUNNING, &cl->remaining); | ||
355 | } | ||
356 | |||
357 | static inline bool closure_is_stopped(struct closure *cl) | ||
358 | { | ||
359 | return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING); | ||
360 | } | ||
361 | |||
362 | static inline bool closure_is_unlocked(struct closure *cl) | ||
363 | { | ||
364 | return atomic_read(&cl->remaining) == -1; | ||
365 | } | ||
366 | |||
367 | static inline void do_closure_init(struct closure *cl, struct closure *parent, | ||
368 | bool running) | ||
369 | { | ||
370 | switch (cl->type) { | ||
371 | case TYPE_closure_with_timer: | ||
372 | case TYPE_closure_with_waitlist_and_timer: | ||
373 | do_closure_timer_init(cl); | ||
374 | default: | ||
375 | break; | ||
376 | } | ||
377 | |||
378 | cl->parent = parent; | ||
379 | if (parent) | ||
380 | closure_get(parent); | ||
381 | |||
382 | if (running) { | ||
383 | closure_debug_create(cl); | ||
384 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); | ||
385 | } else | ||
386 | atomic_set(&cl->remaining, -1); | ||
387 | |||
388 | closure_set_ip(cl); | ||
389 | } | ||
390 | |||
391 | /* | ||
392 | * Hack to get at the embedded closure if there is one, by doing an unsafe cast: | ||
393 | * the result of __closure_type() is thrown away, it's used merely for type | ||
394 | * checking. | ||
395 | */ | ||
396 | #define __to_internal_closure(cl) \ | ||
397 | ({ \ | ||
398 | BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ | ||
399 | (struct closure *) cl; \ | ||
400 | }) | ||
401 | |||
402 | #define closure_init_type(cl, parent, running) \ | ||
403 | do { \ | ||
404 | struct closure *_cl = __to_internal_closure(cl); \ | ||
405 | _cl->type = __closure_type(*(cl)); \ | ||
406 | do_closure_init(_cl, parent, running); \ | ||
407 | } while (0) | ||
408 | |||
409 | /** | ||
410 | * __closure_init() - Initialize a closure, skipping the memset() | ||
411 | * | ||
412 | * May be used instead of closure_init() when memory has already been zeroed. | ||
413 | */ | ||
414 | #define __closure_init(cl, parent) \ | ||
415 | closure_init_type(cl, parent, true) | ||
416 | |||
417 | /** | ||
418 | * closure_init() - Initialize a closure, setting the refcount to 1 | ||
419 | * @cl: closure to initialize | ||
420 | * @parent: parent of the new closure. cl will take a refcount on it for its | ||
421 | * lifetime; may be NULL. | ||
422 | */ | ||
423 | #define closure_init(cl, parent) \ | ||
424 | do { \ | ||
425 | memset((cl), 0, sizeof(*(cl))); \ | ||
426 | __closure_init(cl, parent); \ | ||
427 | } while (0) | ||
428 | |||
429 | static inline void closure_init_stack(struct closure *cl) | ||
430 | { | ||
431 | memset(cl, 0, sizeof(struct closure)); | ||
432 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| | ||
433 | CLOSURE_BLOCKING|CLOSURE_STACK); | ||
434 | } | ||
435 | |||
436 | /** | ||
437 | * closure_init_unlocked() - Initialize a closure but leave it unlocked. | ||
438 | * @cl: closure to initialize | ||
439 | * | ||
440 | * For when the closure will be used as a lock. The closure may not be used | ||
441 | * until after a closure_lock() or closure_trylock(). | ||
442 | */ | ||
443 | #define closure_init_unlocked(cl) \ | ||
444 | do { \ | ||
445 | memset((cl), 0, sizeof(*(cl))); \ | ||
446 | closure_init_type(cl, NULL, false); \ | ||
447 | } while (0) | ||
448 | |||
449 | /** | ||
450 | * closure_lock() - lock and initialize a closure. | ||
451 | * @cl: the closure to lock | ||
452 | * @parent: the new parent for this closure | ||
453 | * | ||
454 | * The closure must be of one of the types that has a waitlist (otherwise we | ||
455 | * wouldn't be able to sleep on contention). | ||
456 | * | ||
457 | * @parent has exactly the same meaning as in closure_init(); if non null, the | ||
458 | * closure will take a reference on @parent which will be released when it is | ||
459 | * unlocked. | ||
460 | */ | ||
461 | #define closure_lock(cl, parent) \ | ||
462 | __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) | ||
463 | |||
464 | /** | ||
465 | * closure_delay() - delay some number of jiffies | ||
466 | * @cl: the closure that will sleep | ||
467 | * @delay: the delay in jiffies | ||
468 | * | ||
469 | * Takes a refcount on @cl which will be released after @delay jiffies; this may | ||
470 | * be used to have a function run after a delay with continue_at(), or | ||
471 | * closure_sync() may be used for a convoluted version of msleep(). | ||
472 | */ | ||
473 | #define closure_delay(cl, delay) \ | ||
474 | __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer) | ||
475 | |||
476 | #define closure_flush(cl) \ | ||
477 | __closure_flush(__to_internal_closure(cl), &(cl)->timer) | ||
478 | |||
479 | #define closure_flush_sync(cl) \ | ||
480 | __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer) | ||
481 | |||
482 | static inline void __closure_end_sleep(struct closure *cl) | ||
483 | { | ||
484 | __set_current_state(TASK_RUNNING); | ||
485 | |||
486 | if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) | ||
487 | atomic_sub(CLOSURE_SLEEPING, &cl->remaining); | ||
488 | } | ||
489 | |||
490 | static inline void __closure_start_sleep(struct closure *cl) | ||
491 | { | ||
492 | closure_set_ip(cl); | ||
493 | cl->task = current; | ||
494 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
495 | |||
496 | if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) | ||
497 | atomic_add(CLOSURE_SLEEPING, &cl->remaining); | ||
498 | } | ||
499 | |||
500 | /** | ||
501 | * closure_blocking() - returns true if the closure is in blocking mode. | ||
502 | * | ||
503 | * If a closure is in blocking mode, closure_wait_event() will sleep until the | ||
504 | * condition is true instead of waiting asynchronously. | ||
505 | */ | ||
506 | static inline bool closure_blocking(struct closure *cl) | ||
507 | { | ||
508 | return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; | ||
509 | } | ||
510 | |||
511 | /** | ||
512 | * set_closure_blocking() - put a closure in blocking mode. | ||
513 | * | ||
514 | * If a closure is in blocking mode, closure_wait_event() will sleep until the | ||
515 | * condition is true instead of waiting asynchronously. | ||
516 | * | ||
517 | * Not thread safe - can only be called by the thread running the closure. | ||
518 | */ | ||
519 | static inline void set_closure_blocking(struct closure *cl) | ||
520 | { | ||
521 | if (!closure_blocking(cl)) | ||
522 | atomic_add(CLOSURE_BLOCKING, &cl->remaining); | ||
523 | } | ||
524 | |||
525 | /* | ||
526 | * Not thread safe - can only be called by the thread running the closure. | ||
527 | */ | ||
528 | static inline void clear_closure_blocking(struct closure *cl) | ||
529 | { | ||
530 | if (closure_blocking(cl)) | ||
531 | atomic_sub(CLOSURE_BLOCKING, &cl->remaining); | ||
532 | } | ||
533 | |||
534 | /** | ||
535 | * closure_wake_up() - wake up all closures on a wait list. | ||
536 | */ | ||
537 | static inline void closure_wake_up(struct closure_waitlist *list) | ||
538 | { | ||
539 | smp_mb(); | ||
540 | __closure_wake_up(list); | ||
541 | } | ||
542 | |||
/*
 * Wait on an event, synchronously or asynchronously - analogous to
 * wait_event() but for closures.
 *
 * The loop is oddly structured so as to avoid a race; we must check the
 * condition again after we've added ourself to the waitlist. We know if we
 * were already on the waitlist because closure_wait() returns false; thus,
 * we only schedule or break if closure_wait() returns false. If it returns
 * true, we just loop again - rechecking the condition.
 *
 * The __closure_wake_up() is necessary because we may race with the event
 * becoming true; i.e. we see event false -> wait -> recheck condition, but
 * the thread that made the event true may have called closure_wake_up()
 * before we added ourself to the wait list.
 *
 * We have to call closure_sync() at the end instead of just
 * __closure_end_sleep() because a different thread might've called
 * closure_wake_up() before us and gotten preempted before they dropped the
 * refcount on our closure. If this was a stack allocated closure, that
 * would be bad.
 */
#define __closure_wait_event(list, cl, condition, _block)	\
({								\
	bool blocking = _block;					\
	typeof(condition) _ret;					\
								\
	while (1) {						\
		_ret = (condition);				\
		if (_ret) {					\
			__closure_wake_up(list);		\
			if (blocking)				\
				closure_sync(cl);		\
								\
			break;					\
		}						\
								\
		if (blocking)					\
			__closure_start_sleep(cl);		\
								\
		if (!closure_wait(list, cl)) {			\
			if (!blocking)				\
				break;				\
								\
			schedule();				\
		}						\
	}							\
								\
	_ret;							\
})

/**
 * closure_wait_event() - wait on a condition, synchronously or
 * asynchronously.
 * @list: the wait list to wait on
 * @cl: the closure that is doing the waiting
 * @condition: a C expression for the event to wait for
 *
 * If the closure is in blocking mode, sleeps until the @condition evaluates
 * to true - exactly like wait_event().
 *
 * If the closure is not in blocking mode, waits asynchronously; if the
 * condition is currently false the @cl is put onto @list and returns. @list
 * owns a refcount on @cl; closure_sync() or continue_at() may be used later
 * to wait for another thread to wake up @list, which drops the refcount on
 * @cl.
 *
 * Returns the value of @condition; @cl will be on @list iff @condition was
 * false.
 *
 * closure_wake_up(@list) must be called after changing any variable that
 * could cause @condition to become true.
 */
#define closure_wait_event(list, cl, condition)			\
	__closure_wait_event(list, cl, condition, closure_blocking(cl))

#define closure_wait_event_async(list, cl, condition)		\
	__closure_wait_event(list, cl, condition, false)

#define closure_wait_event_sync(list, cl, condition)		\
	__closure_wait_event(list, cl, condition, true)
621 | |||
622 | static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | ||
623 | struct workqueue_struct *wq) | ||
624 | { | ||
625 | BUG_ON(object_is_on_stack(cl)); | ||
626 | closure_set_ip(cl); | ||
627 | cl->fn = fn; | ||
628 | cl->wq = wq; | ||
629 | /* between atomic_dec() in closure_put() */ | ||
630 | smp_mb__before_atomic_dec(); | ||
631 | } | ||
632 | |||
633 | #define continue_at(_cl, _fn, _wq) \ | ||
634 | do { \ | ||
635 | set_closure_fn(_cl, _fn, _wq); \ | ||
636 | closure_sub(_cl, CLOSURE_RUNNING + 1); \ | ||
637 | return; \ | ||
638 | } while (0) | ||
639 | |||
640 | #define closure_return(_cl) continue_at((_cl), NULL, NULL) | ||
641 | |||
/*
 * Like continue_at(), but queues the closure immediately instead of waiting
 * for the refcount to drop (no barrier/refcount handoff).
 *
 * Fix: the original body passed `cl` - the caller's variable name - to
 * closure_queue() instead of the macro parameter `_cl`, so the macro only
 * worked when the argument happened to be a variable literally named `cl`.
 */
#define continue_at_nobarrier(_cl, _fn, _wq)			\
do {								\
	set_closure_fn(_cl, _fn, _wq);				\
	closure_queue(_cl);					\
	return;							\
} while (0)
648 | |||
649 | #define closure_return_with_destructor(_cl, _destructor) \ | ||
650 | do { \ | ||
651 | set_closure_fn(_cl, _destructor, NULL); \ | ||
652 | closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ | ||
653 | return; \ | ||
654 | } while (0) | ||
655 | |||
656 | static inline void closure_call(struct closure *cl, closure_fn fn, | ||
657 | struct workqueue_struct *wq, | ||
658 | struct closure *parent) | ||
659 | { | ||
660 | closure_init(cl, parent); | ||
661 | continue_at_nobarrier(cl, fn, wq); | ||
662 | } | ||
663 | |||
664 | static inline void closure_trylock_call(struct closure *cl, closure_fn fn, | ||
665 | struct workqueue_struct *wq, | ||
666 | struct closure *parent) | ||
667 | { | ||
668 | if (closure_trylock(cl, parent)) | ||
669 | continue_at_nobarrier(cl, fn, wq); | ||
670 | } | ||
671 | |||
672 | #endif /* _LINUX_CLOSURE_H */ | ||
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c new file mode 100644 index 000000000000..89fd5204924e --- /dev/null +++ b/drivers/md/bcache/debug.c | |||
@@ -0,0 +1,565 @@ | |||
1 | /* | ||
2 | * Assorted bcache debug code | ||
3 | * | ||
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include "bcache.h" | ||
9 | #include "btree.h" | ||
10 | #include "debug.h" | ||
11 | #include "request.h" | ||
12 | |||
13 | #include <linux/console.h> | ||
14 | #include <linux/debugfs.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/random.h> | ||
17 | #include <linux/seq_file.h> | ||
18 | |||
19 | static struct dentry *debug; | ||
20 | |||
21 | const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | ||
22 | { | ||
23 | unsigned i; | ||
24 | |||
25 | for (i = 0; i < KEY_PTRS(k); i++) | ||
26 | if (ptr_available(c, k, i)) { | ||
27 | struct cache *ca = PTR_CACHE(c, k, i); | ||
28 | size_t bucket = PTR_BUCKET_NR(c, k, i); | ||
29 | size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); | ||
30 | |||
31 | if (KEY_SIZE(k) + r > c->sb.bucket_size) | ||
32 | return "bad, length too big"; | ||
33 | if (bucket < ca->sb.first_bucket) | ||
34 | return "bad, short offset"; | ||
35 | if (bucket >= ca->sb.nbuckets) | ||
36 | return "bad, offset past end of device"; | ||
37 | if (ptr_stale(c, k, i)) | ||
38 | return "stale"; | ||
39 | } | ||
40 | |||
41 | if (!bkey_cmp(k, &ZERO_KEY)) | ||
42 | return "bad, null key"; | ||
43 | if (!KEY_PTRS(k)) | ||
44 | return "bad, no pointers"; | ||
45 | if (!KEY_SIZE(k)) | ||
46 | return "zeroed key"; | ||
47 | return ""; | ||
48 | } | ||
49 | |||
/*
 * Format @k as "inode:offset len N -> [dev:offset gen G, ...]" (plus
 * "dirty"/checksum suffixes) into a by-value buffer; backs the pkey()
 * helper in debug.h.
 */
struct keyprint_hack bch_pkey(const struct bkey *k)
{
	unsigned i = 0;
	struct keyprint_hack r;
	char *out = r.s, *end = r.s + KEYHACK_SIZE;

/* Append to the buffer; scnprintf() guarantees we never pass @end. */
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))

	p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));

	/* Emit each pointer, comma separated, with no trailing comma. */
	if (KEY_PTRS(k))
		while (1) {
			p("%llu:%llu gen %llu",
			  PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));

			if (++i == KEY_PTRS(k))
				break;

			p(", ");
		}

	p("]");

	if (KEY_DIRTY(k))
		p(" dirty");
	if (KEY_CSUM(k))
		p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
#undef p
	return r;
}
80 | |||
81 | struct keyprint_hack bch_pbtree(const struct btree *b) | ||
82 | { | ||
83 | struct keyprint_hack r; | ||
84 | |||
85 | snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), | ||
86 | b->level, b->c->root ? b->c->root->level : -1); | ||
87 | return r; | ||
88 | } | ||
89 | |||
90 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) | ||
91 | |||
/*
 * True if the key after @k sorts before @k — i.e. the bset is out of
 * order.  Leaf nodes (level 0) compare against the next key's start
 * (extents may not overlap); interior nodes compare whole keys.
 */
static bool skipped_backwards(struct btree *b, struct bkey *k)
{
	return bkey_cmp(k, (!b->level)
			? &START_KEY(bkey_next(k))
			: bkey_next(k)) > 0;
}
98 | |||
/*
 * Dump every key in @i to the console: position within the node, the
 * formatted key, each pointer's bucket (and priority when the bucket
 * number is in range), plus the key's status string.  Flags any key that
 * sorts after its successor.
 */
static void dump_bset(struct btree *b, struct bset *i)
{
	struct bkey *k;
	unsigned j;

	for (k = i->start; k < end(i); k = bkey_next(k)) {
		printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
		       (uint64_t *) k - i->d, i->keys, pkey(k));

		for (j = 0; j < KEY_PTRS(k); j++) {
			size_t n = PTR_BUCKET_NR(b->c, k, j);
			printk(" bucket %zu", n);

			/* Only dereference the bucket if n is valid. */
			if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
				printk(" prio %i",
				       PTR_BUCKET(b->c, k, j)->prio);
		}

		printk(" %s\n", bch_ptr_status(b->c, k));

		if (bkey_next(k) < end(i) &&
		    skipped_backwards(b, k))
			printk(KERN_ERR "Key skipped backwards\n");
	}
}
124 | |||
125 | #endif | ||
126 | |||
127 | #ifdef CONFIG_BCACHE_DEBUG | ||
128 | |||
129 | void bch_btree_verify(struct btree *b, struct bset *new) | ||
130 | { | ||
131 | struct btree *v = b->c->verify_data; | ||
132 | struct closure cl; | ||
133 | closure_init_stack(&cl); | ||
134 | |||
135 | if (!b->c->verify) | ||
136 | return; | ||
137 | |||
138 | closure_wait_event(&b->io.wait, &cl, | ||
139 | atomic_read(&b->io.cl.remaining) == -1); | ||
140 | |||
141 | mutex_lock(&b->c->verify_lock); | ||
142 | |||
143 | bkey_copy(&v->key, &b->key); | ||
144 | v->written = 0; | ||
145 | v->level = b->level; | ||
146 | |||
147 | bch_btree_read(v); | ||
148 | closure_wait_event(&v->io.wait, &cl, | ||
149 | atomic_read(&b->io.cl.remaining) == -1); | ||
150 | |||
151 | if (new->keys != v->sets[0].data->keys || | ||
152 | memcmp(new->start, | ||
153 | v->sets[0].data->start, | ||
154 | (void *) end(new) - (void *) new->start)) { | ||
155 | unsigned i, j; | ||
156 | |||
157 | console_lock(); | ||
158 | |||
159 | printk(KERN_ERR "*** original memory node:\n"); | ||
160 | for (i = 0; i <= b->nsets; i++) | ||
161 | dump_bset(b, b->sets[i].data); | ||
162 | |||
163 | printk(KERN_ERR "*** sorted memory node:\n"); | ||
164 | dump_bset(b, new); | ||
165 | |||
166 | printk(KERN_ERR "*** on disk node:\n"); | ||
167 | dump_bset(v, v->sets[0].data); | ||
168 | |||
169 | for (j = 0; j < new->keys; j++) | ||
170 | if (new->d[j] != v->sets[0].data->d[j]) | ||
171 | break; | ||
172 | |||
173 | console_unlock(); | ||
174 | panic("verify failed at %u\n", j); | ||
175 | } | ||
176 | |||
177 | mutex_unlock(&b->c->verify_lock); | ||
178 | } | ||
179 | |||
180 | static void data_verify_endio(struct bio *bio, int error) | ||
181 | { | ||
182 | struct closure *cl = bio->bi_private; | ||
183 | closure_put(cl); | ||
184 | } | ||
185 | |||
/*
 * Re-read the data for a completed cached read directly and memcmp() it
 * page by page against what was returned, logging any sector where the
 * two disagree.  Allocation failure just skips verification (best effort).
 */
void bch_data_verify(struct search *s)
{
	char name[BDEVNAME_SIZE];
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
	struct closure *cl = &s->cl;
	struct bio *check;
	struct bio_vec *bv;
	int i;

	/*
	 * Normalize bvecs to full pages so the clone's pages line up with
	 * the original's for the comparison below.
	 */
	if (!s->unaligned_bvec)
		bio_for_each_segment(bv, s->orig_bio, i)
			bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;

	check = bio_clone(s->orig_bio, GFP_NOIO);
	if (!check)
		return;

	if (bch_bio_alloc_pages(check, GFP_NOIO))
		goto out_put;

	check->bi_rw		= READ_SYNC;
	check->bi_private	= cl;
	check->bi_end_io	= data_verify_endio;

	/* Synchronously read the same range straight from the device. */
	closure_bio_submit(check, cl, &dc->disk);
	closure_sync(cl);

	bio_for_each_segment(bv, s->orig_bio, i) {
		void *p1 = kmap(bv->bv_page);
		void *p2 = kmap(check->bi_io_vec[i].bv_page);

		if (memcmp(p1 + bv->bv_offset,
			   p2 + bv->bv_offset,
			   bv->bv_len))
			printk(KERN_ERR
			       "bcache (%s): verify failed at sector %llu\n",
			       bdevname(dc->bdev, name),
			       (uint64_t) s->orig_bio->bi_sector);

		kunmap(bv->bv_page);
		kunmap(check->bi_io_vec[i].bv_page);
	}

	/* Free the pages we allocated for the verify read. */
	__bio_for_each_segment(bv, check, i, 0)
		__free_page(bv->bv_page);
out_put:
	bio_put(check);
}
234 | |||
235 | #endif | ||
236 | |||
237 | #ifdef CONFIG_BCACHE_EDEBUG | ||
238 | |||
239 | unsigned bch_count_data(struct btree *b) | ||
240 | { | ||
241 | unsigned ret = 0; | ||
242 | struct btree_iter iter; | ||
243 | struct bkey *k; | ||
244 | |||
245 | if (!b->level) | ||
246 | for_each_key(b, k, &iter) | ||
247 | ret += KEY_SIZE(k); | ||
248 | return ret; | ||
249 | } | ||
250 | |||
/*
 * Dump every bset of @b to the console (holding the console lock so the
 * output stays contiguous), print the caller's message, then panic with
 * the node's position.  Does not return.
 */
static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
				   va_list args)
{
	unsigned i;

	console_lock();

	for (i = 0; i <= b->nsets; i++)
		dump_bset(b, b->sets[i].data);

	vprintk(fmt, args);

	console_unlock();

	panic("at %s\n", pbtree(b));
}
267 | |||
/*
 * Verify that the keys in @i are in sorted order; panics (via
 * vdump_bucket_and_panic) with the caller-supplied message on the first
 * out-of-order pair.
 */
void bch_check_key_order_msg(struct btree *b, struct bset *i,
			     const char *fmt, ...)
{
	struct bkey *k;

	if (!i->keys)
		return;

	for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k))
		if (skipped_backwards(b, k)) {
			va_list args;
			va_start(args, fmt);

			/* Panics; va_end() is for form's sake. */
			vdump_bucket_and_panic(b, fmt, args);
			va_end(args);
		}
}
285 | |||
/*
 * Walk leaf node @b checking that consecutive keys neither sort backwards
 * nor overlap (after skipping invalid keys); panics with the caller's
 * message and a full dump on the first violation.  No-op on interior nodes.
 */
void bch_check_keys(struct btree *b, const char *fmt, ...)
{
	va_list args;
	struct bkey *k, *p = NULL;
	struct btree_iter iter;

	if (b->level)
		return;

	for_each_key(b, k, &iter) {
		/* Ordering is checked even for invalid keys... */
		if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) {
			printk(KERN_ERR "Keys out of order:\n");
			goto bug;
		}

		/* ...but invalid keys don't participate in overlap checks. */
		if (bch_ptr_invalid(b, k))
			continue;

		if (p && bkey_cmp(p, &START_KEY(k)) > 0) {
			printk(KERN_ERR "Overlapping keys:\n");
			goto bug;
		}
		p = k;
	}
	return;
bug:
	va_start(args, fmt);
	vdump_bucket_and_panic(b, fmt, args);
	va_end(args);
}
316 | |||
317 | #endif | ||
318 | |||
319 | #ifdef CONFIG_DEBUG_FS | ||
320 | |||
/* XXX: cache set refcounting */

/* Per-open-file state for the debugfs key dump. */
struct dump_iterator {
	char buf[PAGE_SIZE];	/* formatted keys not yet copied to userspace */
	size_t bytes;		/* valid bytes in buf */
	struct cache_set *c;
	struct keybuf keys;	/* scan cursor over the btree */
};

/* Keybuf predicate: include every key in the dump. */
static bool dump_pred(struct keybuf *buf, struct bkey *k)
{
	return true;
}
334 | |||
335 | static ssize_t bch_dump_read(struct file *file, char __user *buf, | ||
336 | size_t size, loff_t *ppos) | ||
337 | { | ||
338 | struct dump_iterator *i = file->private_data; | ||
339 | ssize_t ret = 0; | ||
340 | |||
341 | while (size) { | ||
342 | struct keybuf_key *w; | ||
343 | unsigned bytes = min(i->bytes, size); | ||
344 | |||
345 | int err = copy_to_user(buf, i->buf, bytes); | ||
346 | if (err) | ||
347 | return err; | ||
348 | |||
349 | ret += bytes; | ||
350 | buf += bytes; | ||
351 | size -= bytes; | ||
352 | i->bytes -= bytes; | ||
353 | memmove(i->buf, i->buf + bytes, i->bytes); | ||
354 | |||
355 | if (i->bytes) | ||
356 | break; | ||
357 | |||
358 | w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); | ||
359 | if (!w) | ||
360 | break; | ||
361 | |||
362 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); | ||
363 | bch_keybuf_del(&i->keys, w); | ||
364 | } | ||
365 | |||
366 | return ret; | ||
367 | } | ||
368 | |||
369 | static int bch_dump_open(struct inode *inode, struct file *file) | ||
370 | { | ||
371 | struct cache_set *c = inode->i_private; | ||
372 | struct dump_iterator *i; | ||
373 | |||
374 | i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); | ||
375 | if (!i) | ||
376 | return -ENOMEM; | ||
377 | |||
378 | file->private_data = i; | ||
379 | i->c = c; | ||
380 | bch_keybuf_init(&i->keys, dump_pred); | ||
381 | i->keys.last_scanned = KEY(0, 0, 0); | ||
382 | |||
383 | return 0; | ||
384 | } | ||
385 | |||
/* Free the dump_iterator allocated in bch_dump_open(). */
static int bch_dump_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}
391 | |||
/* file_operations for the per-cache-set debugfs key dump file. */
static const struct file_operations cache_set_debug_ops = {
	.owner		= THIS_MODULE,
	.open		= bch_dump_open,
	.read		= bch_dump_read,
	.release	= bch_dump_release
};
398 | |||
399 | void bch_debug_init_cache_set(struct cache_set *c) | ||
400 | { | ||
401 | if (!IS_ERR_OR_NULL(debug)) { | ||
402 | char name[50]; | ||
403 | snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); | ||
404 | |||
405 | c->debug = debugfs_create_file(name, 0400, debug, c, | ||
406 | &cache_set_debug_ops); | ||
407 | } | ||
408 | } | ||
409 | |||
410 | #endif | ||
411 | |||
412 | /* Fuzz tester has rotted: */ | ||
413 | #if 0 | ||
414 | |||
415 | static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, | ||
416 | const char *buffer, size_t size) | ||
417 | { | ||
418 | void dump(struct btree *b) | ||
419 | { | ||
420 | struct bset *i; | ||
421 | |||
422 | for (i = b->sets[0].data; | ||
423 | index(i, b) < btree_blocks(b) && | ||
424 | i->seq == b->sets[0].data->seq; | ||
425 | i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) | ||
426 | dump_bset(b, i); | ||
427 | } | ||
428 | |||
429 | struct cache_sb *sb; | ||
430 | struct cache_set *c; | ||
431 | struct btree *all[3], *b, *fill, *orig; | ||
432 | int j; | ||
433 | |||
434 | struct btree_op op; | ||
435 | bch_btree_op_init_stack(&op); | ||
436 | |||
437 | sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); | ||
438 | if (!sb) | ||
439 | return -ENOMEM; | ||
440 | |||
441 | sb->bucket_size = 128; | ||
442 | sb->block_size = 4; | ||
443 | |||
444 | c = bch_cache_set_alloc(sb); | ||
445 | if (!c) | ||
446 | return -ENOMEM; | ||
447 | |||
448 | for (j = 0; j < 3; j++) { | ||
449 | BUG_ON(list_empty(&c->btree_cache)); | ||
450 | all[j] = list_first_entry(&c->btree_cache, struct btree, list); | ||
451 | list_del_init(&all[j]->list); | ||
452 | |||
453 | all[j]->key = KEY(0, 0, c->sb.bucket_size); | ||
454 | bkey_copy_key(&all[j]->key, &MAX_KEY); | ||
455 | } | ||
456 | |||
457 | b = all[0]; | ||
458 | fill = all[1]; | ||
459 | orig = all[2]; | ||
460 | |||
461 | while (1) { | ||
462 | for (j = 0; j < 3; j++) | ||
463 | all[j]->written = all[j]->nsets = 0; | ||
464 | |||
465 | bch_bset_init_next(b); | ||
466 | |||
467 | while (1) { | ||
468 | struct bset *i = write_block(b); | ||
469 | struct bkey *k = op.keys.top; | ||
470 | unsigned rand; | ||
471 | |||
472 | bkey_init(k); | ||
473 | rand = get_random_int(); | ||
474 | |||
475 | op.type = rand & 1 | ||
476 | ? BTREE_INSERT | ||
477 | : BTREE_REPLACE; | ||
478 | rand >>= 1; | ||
479 | |||
480 | SET_KEY_SIZE(k, bucket_remainder(c, rand)); | ||
481 | rand >>= c->bucket_bits; | ||
482 | rand &= 1024 * 512 - 1; | ||
483 | rand += c->sb.bucket_size; | ||
484 | SET_KEY_OFFSET(k, rand); | ||
485 | #if 0 | ||
486 | SET_KEY_PTRS(k, 1); | ||
487 | #endif | ||
488 | bch_keylist_push(&op.keys); | ||
489 | bch_btree_insert_keys(b, &op); | ||
490 | |||
491 | if (should_split(b) || | ||
492 | set_blocks(i, b->c) != | ||
493 | __set_blocks(i, i->keys + 15, b->c)) { | ||
494 | i->csum = csum_set(i); | ||
495 | |||
496 | memcpy(write_block(fill), | ||
497 | i, set_bytes(i)); | ||
498 | |||
499 | b->written += set_blocks(i, b->c); | ||
500 | fill->written = b->written; | ||
501 | if (b->written == btree_blocks(b)) | ||
502 | break; | ||
503 | |||
504 | bch_btree_sort_lazy(b); | ||
505 | bch_bset_init_next(b); | ||
506 | } | ||
507 | } | ||
508 | |||
509 | memcpy(orig->sets[0].data, | ||
510 | fill->sets[0].data, | ||
511 | btree_bytes(c)); | ||
512 | |||
513 | bch_btree_sort(b); | ||
514 | fill->written = 0; | ||
515 | bch_btree_read_done(&fill->io.cl); | ||
516 | |||
517 | if (b->sets[0].data->keys != fill->sets[0].data->keys || | ||
518 | memcmp(b->sets[0].data->start, | ||
519 | fill->sets[0].data->start, | ||
520 | b->sets[0].data->keys * sizeof(uint64_t))) { | ||
521 | struct bset *i = b->sets[0].data; | ||
522 | struct bkey *k, *l; | ||
523 | |||
524 | for (k = i->start, | ||
525 | l = fill->sets[0].data->start; | ||
526 | k < end(i); | ||
527 | k = bkey_next(k), l = bkey_next(l)) | ||
528 | if (bkey_cmp(k, l) || | ||
529 | KEY_SIZE(k) != KEY_SIZE(l)) | ||
530 | pr_err("key %zi differs: %s != %s", | ||
531 | (uint64_t *) k - i->d, | ||
532 | pkey(k), pkey(l)); | ||
533 | |||
534 | for (j = 0; j < 3; j++) { | ||
535 | pr_err("**** Set %i ****", j); | ||
536 | dump(all[j]); | ||
537 | } | ||
538 | panic("\n"); | ||
539 | } | ||
540 | |||
541 | pr_info("fuzz complete: %i keys", b->sets[0].data->keys); | ||
542 | } | ||
543 | } | ||
544 | |||
545 | kobj_attribute_write(fuzz, btree_fuzz); | ||
546 | #endif | ||
547 | |||
/* Tear down bcache's debugfs directory (and everything under it). */
void bch_debug_exit(void)
{
	if (!IS_ERR_OR_NULL(debug))
		debugfs_remove_recursive(debug);
}
553 | |||
/*
 * Module init hook: create the "bcache" debugfs root.  Failure to create
 * it is not fatal — creation of per-cache-set files just gets skipped.
 */
int __init bch_debug_init(struct kobject *kobj)
{
	int ret = 0;
#if 0
	/* Dead code: hook for the rotted fuzz tester above. */
	ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
	if (ret)
		return ret;
#endif

	debug = debugfs_create_dir("bcache", NULL);
	return ret;
}
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h new file mode 100644 index 000000000000..f9378a218148 --- /dev/null +++ b/drivers/md/bcache/debug.h | |||
@@ -0,0 +1,54 @@ | |||
1 | #ifndef _BCACHE_DEBUG_H | ||
2 | #define _BCACHE_DEBUG_H | ||
3 | |||
4 | /* Btree/bkey debug printing */ | ||
5 | |||
6 | #define KEYHACK_SIZE 80 | ||
7 | struct keyprint_hack { | ||
8 | char s[KEYHACK_SIZE]; | ||
9 | }; | ||
10 | |||
11 | struct keyprint_hack bch_pkey(const struct bkey *k); | ||
12 | struct keyprint_hack bch_pbtree(const struct btree *b); | ||
13 | #define pkey(k) (&bch_pkey(k).s[0]) | ||
14 | #define pbtree(b) (&bch_pbtree(b).s[0]) | ||
15 | |||
16 | #ifdef CONFIG_BCACHE_EDEBUG | ||
17 | |||
18 | unsigned bch_count_data(struct btree *); | ||
19 | void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...); | ||
20 | void bch_check_keys(struct btree *, const char *, ...); | ||
21 | |||
22 | #define bch_check_key_order(b, i) \ | ||
23 | bch_check_key_order_msg(b, i, "keys out of order") | ||
24 | #define EBUG_ON(cond) BUG_ON(cond) | ||
25 | |||
26 | #else /* EDEBUG */ | ||
27 | |||
28 | #define bch_count_data(b) 0 | ||
29 | #define bch_check_key_order(b, i) do {} while (0) | ||
30 | #define bch_check_key_order_msg(b, i, ...) do {} while (0) | ||
31 | #define bch_check_keys(b, ...) do {} while (0) | ||
32 | #define EBUG_ON(cond) do {} while (0) | ||
33 | |||
34 | #endif | ||
35 | |||
36 | #ifdef CONFIG_BCACHE_DEBUG | ||
37 | |||
38 | void bch_btree_verify(struct btree *, struct bset *); | ||
39 | void bch_data_verify(struct search *); | ||
40 | |||
41 | #else /* DEBUG */ | ||
42 | |||
43 | static inline void bch_btree_verify(struct btree *b, struct bset *i) {} | ||
44 | static inline void bch_data_verify(struct search *s) {}; | ||
45 | |||
46 | #endif | ||
47 | |||
48 | #ifdef CONFIG_DEBUG_FS | ||
49 | void bch_debug_init_cache_set(struct cache_set *); | ||
50 | #else | ||
51 | static inline void bch_debug_init_cache_set(struct cache_set *c) {} | ||
52 | #endif | ||
53 | |||
54 | #endif | ||
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c new file mode 100644 index 000000000000..48efd4dea645 --- /dev/null +++ b/drivers/md/bcache/io.c | |||
@@ -0,0 +1,397 @@ | |||
1 | /* | ||
2 | * Some low level IO code, and hacks for various block layer limitations | ||
3 | * | ||
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include "bcache.h" | ||
9 | #include "bset.h" | ||
10 | #include "debug.h" | ||
11 | |||
12 | static void bch_bi_idx_hack_endio(struct bio *bio, int error) | ||
13 | { | ||
14 | struct bio *p = bio->bi_private; | ||
15 | |||
16 | bio_endio(p, error); | ||
17 | bio_put(bio); | ||
18 | } | ||
19 | |||
/*
 * Submit @bio via generic_make_request(), first rewriting it so it is
 * palatable to drivers: bios with a nonzero bi_idx are copied into a fresh
 * clone starting at index 0, and bi_max_vecs is clamped to bi_vcnt.
 */
static void bch_generic_make_request_hack(struct bio *bio)
{
	if (bio->bi_idx) {
		/* Re-pack the remaining segments into a clone at idx 0. */
		struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio));

		memcpy(clone->bi_io_vec,
		       bio_iovec(bio),
		       bio_segments(bio) * sizeof(struct bio_vec));

		clone->bi_sector	= bio->bi_sector;
		clone->bi_bdev		= bio->bi_bdev;
		clone->bi_rw		= bio->bi_rw;
		clone->bi_vcnt		= bio_segments(bio);
		clone->bi_size		= bio->bi_size;

		/* Completion of the clone completes the original. */
		clone->bi_private	= bio;
		clone->bi_end_io	= bch_bi_idx_hack_endio;

		bio = clone;
	}

	/*
	 * Hack, since drivers that clone bios clone up to bi_max_vecs, but our
	 * bios might have had more than that (before we split them per device
	 * limitations).
	 *
	 * To be taken out once immutable bvec stuff is in.
	 */
	bio->bi_max_vecs = bio->bi_vcnt;

	generic_make_request(bio);
}
52 | |||
53 | /** | ||
54 | * bch_bio_split - split a bio | ||
55 | * @bio: bio to split | ||
56 | * @sectors: number of sectors to split from the front of @bio | ||
57 | * @gfp: gfp mask | ||
58 | * @bs: bio set to allocate from | ||
59 | * | ||
60 | * Allocates and returns a new bio which represents @sectors from the start of | ||
61 | * @bio, and updates @bio to represent the remaining sectors. | ||
62 | * | ||
63 | * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio | ||
64 | * unchanged. | ||
65 | * | ||
66 | * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a | ||
67 | * bvec boundry; it is the caller's responsibility to ensure that @bio is not | ||
68 | * freed before the split. | ||
69 | * | ||
70 | * If bch_bio_split() is running under generic_make_request(), it's not safe to | ||
71 | * allocate more than one bio from the same bio set. Therefore, if it is running | ||
72 | * under generic_make_request() it masks out __GFP_WAIT when doing the | ||
73 | * allocation. The caller must check for failure if there's any possibility of | ||
74 | * it being called from under generic_make_request(); it is then the caller's | ||
75 | * responsibility to retry from a safe context (by e.g. punting to workqueue). | ||
76 | */ | ||
77 | struct bio *bch_bio_split(struct bio *bio, int sectors, | ||
78 | gfp_t gfp, struct bio_set *bs) | ||
79 | { | ||
80 | unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; | ||
81 | struct bio_vec *bv; | ||
82 | struct bio *ret = NULL; | ||
83 | |||
84 | BUG_ON(sectors <= 0); | ||
85 | |||
86 | /* | ||
87 | * If we're being called from underneath generic_make_request() and we | ||
88 | * already allocated any bios from this bio set, we risk deadlock if we | ||
89 | * use the mempool. So instead, we possibly fail and let the caller punt | ||
90 | * to workqueue or somesuch and retry in a safe context. | ||
91 | */ | ||
92 | if (current->bio_list) | ||
93 | gfp &= ~__GFP_WAIT; | ||
94 | |||
95 | if (sectors >= bio_sectors(bio)) | ||
96 | return bio; | ||
97 | |||
98 | if (bio->bi_rw & REQ_DISCARD) { | ||
99 | ret = bio_alloc_bioset(gfp, 1, bs); | ||
100 | idx = 0; | ||
101 | goto out; | ||
102 | } | ||
103 | |||
104 | bio_for_each_segment(bv, bio, idx) { | ||
105 | vcnt = idx - bio->bi_idx; | ||
106 | |||
107 | if (!nbytes) { | ||
108 | ret = bio_alloc_bioset(gfp, vcnt, bs); | ||
109 | if (!ret) | ||
110 | return NULL; | ||
111 | |||
112 | memcpy(ret->bi_io_vec, bio_iovec(bio), | ||
113 | sizeof(struct bio_vec) * vcnt); | ||
114 | |||
115 | break; | ||
116 | } else if (nbytes < bv->bv_len) { | ||
117 | ret = bio_alloc_bioset(gfp, ++vcnt, bs); | ||
118 | if (!ret) | ||
119 | return NULL; | ||
120 | |||
121 | memcpy(ret->bi_io_vec, bio_iovec(bio), | ||
122 | sizeof(struct bio_vec) * vcnt); | ||
123 | |||
124 | ret->bi_io_vec[vcnt - 1].bv_len = nbytes; | ||
125 | bv->bv_offset += nbytes; | ||
126 | bv->bv_len -= nbytes; | ||
127 | break; | ||
128 | } | ||
129 | |||
130 | nbytes -= bv->bv_len; | ||
131 | } | ||
132 | out: | ||
133 | ret->bi_bdev = bio->bi_bdev; | ||
134 | ret->bi_sector = bio->bi_sector; | ||
135 | ret->bi_size = sectors << 9; | ||
136 | ret->bi_rw = bio->bi_rw; | ||
137 | ret->bi_vcnt = vcnt; | ||
138 | ret->bi_max_vecs = vcnt; | ||
139 | |||
140 | bio->bi_sector += sectors; | ||
141 | bio->bi_size -= sectors << 9; | ||
142 | bio->bi_idx = idx; | ||
143 | |||
144 | if (bio_integrity(bio)) { | ||
145 | if (bio_integrity_clone(ret, bio, gfp)) { | ||
146 | bio_put(ret); | ||
147 | return NULL; | ||
148 | } | ||
149 | |||
150 | bio_integrity_trim(ret, 0, bio_sectors(ret)); | ||
151 | bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); | ||
152 | } | ||
153 | |||
154 | return ret; | ||
155 | } | ||
156 | |||
/*
 * Upper bound on how many sectors of @bio its device will take in a
 * single request, honoring the queue's discard/segment/sector limits and
 * merge_bvec_fn.  Always returns at least one full bvec so callers make
 * forward progress.
 */
static unsigned bch_bio_max_sectors(struct bio *bio)
{
	unsigned ret = bio_sectors(bio);
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
				      queue_max_segments(q));
	struct bio_vec *bv, *end = bio_iovec(bio) +
		min_t(int, bio_segments(bio), max_segments);

	if (bio->bi_rw & REQ_DISCARD)
		return min(ret, q->limits.max_discard_sectors);

	if (bio_segments(bio) > max_segments ||
	    q->merge_bvec_fn) {
		/* Count sectors segment by segment until the driver balks. */
		ret = 0;

		for (bv = bio_iovec(bio); bv < end; bv++) {
			struct bvec_merge_data bvm = {
				.bi_bdev	= bio->bi_bdev,
				.bi_sector	= bio->bi_sector,
				.bi_size	= ret << 9,
				.bi_rw		= bio->bi_rw,
			};

			if (q->merge_bvec_fn &&
			    q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
				break;

			ret += bv->bv_len >> 9;
		}
	}

	ret = min(ret, queue_max_sectors(q));

	WARN_ON(!ret);
	/* Never return 0: allow at least the first bvec. */
	ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9);

	return ret;
}
196 | |||
/*
 * All fragments of a split bio have completed: restore the original bio's
 * completion and finish it, then free the hook.
 */
static void bch_bio_submit_split_done(struct closure *cl)
{
	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);

	s->bio->bi_end_io	= s->bi_end_io;
	s->bio->bi_private	= s->bi_private;
	bio_endio(s->bio, 0);

	closure_debug_destroy(&s->cl);
	mempool_free(s, s->p->bio_split_hook);
}

/*
 * One fragment completed: record any error on the original bio and drop
 * the per-fragment closure ref.
 */
static void bch_bio_submit_split_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);

	if (error)
		clear_bit(BIO_UPTODATE, &s->bio->bi_flags);

	bio_put(bio);
	closure_put(cl);
}
220 | |||
/*
 * Split s->bio into device-sized fragments and submit each, taking a
 * closure ref per fragment.  If bch_bio_split() fails (allocation under
 * generic_make_request()), reschedule ourselves on system_wq and retry
 * from that safe context.
 */
static void __bch_bio_submit_split(struct closure *cl)
{
	struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
	struct bio *bio = s->bio, *n;

	do {
		n = bch_bio_split(bio, bch_bio_max_sectors(bio),
				  GFP_NOIO, s->p->bio_split);
		if (!n)
			continue_at(cl, __bch_bio_submit_split, system_wq);

		n->bi_end_io	= bch_bio_submit_split_endio;
		n->bi_private	= cl;

		closure_get(cl);
		bch_generic_make_request_hack(n);
	} while (n != bio);	/* bch_bio_split() returns @bio for the tail */

	continue_at(cl, bch_bio_submit_split_done, NULL);
}
241 | |||
/*
 * Submit @bio, splitting it first when it exceeds what the target device
 * accepts per request.  Split submissions are tracked by a bio_split_hook
 * so the original bio completes only after every fragment has.
 */
void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
{
	struct bio_split_hook *s;

	/* Data-less (e.g. flush-only) bios never need splitting. */
	if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
		goto submit;

	if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
		goto submit;

	/* mempool_alloc with GFP_NOIO cannot fail (it waits). */
	s = mempool_alloc(p->bio_split_hook, GFP_NOIO);

	/* Stash the caller's completion; restored in split_done. */
	s->bio		= bio;
	s->p		= p;
	s->bi_end_io	= bio->bi_end_io;
	s->bi_private	= bio->bi_private;
	bio_get(bio);

	closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL);
	return;
submit:
	bch_generic_make_request_hack(bio);
}
265 | |||
/* Bios with headers */

/* Return a bbio (allocated by bch_bbio_alloc()) to the cache set's pool. */
void bch_bbio_free(struct bio *bio, struct cache_set *c)
{
	struct bbio *b = container_of(bio, struct bbio, bio);
	mempool_free(b, c->bio_meta);
}

/*
 * Allocate a bbio — a bio with an attached bkey header — from the cache
 * set's mempool, sized for one bucket's worth of inline bvecs.
 */
struct bio *bch_bbio_alloc(struct cache_set *c)
{
	struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
	struct bio *bio = &b->bio;

	bio_init(bio);
	/* Mark the bio as not belonging to any bio_set. */
	bio->bi_flags		|= BIO_POOL_NONE << BIO_POOL_OFFSET;
	bio->bi_max_vecs	 = bucket_pages(c);
	bio->bi_io_vec		 = bio->bi_inline_vecs;

	return bio;
}

/*
 * Submit a bbio to the device/offset named by pointer 0 of its key,
 * recording the submit time for congestion accounting.
 */
void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
{
	struct bbio *b = container_of(bio, struct bbio, bio);

	bio->bi_sector	= PTR_OFFSET(&b->key, 0);
	bio->bi_bdev	= PTR_CACHE(c, &b->key, 0)->bdev;

	b->submit_time_us = local_clock_us();
	closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
}

/* Copy pointer @ptr of @k into the bbio's key, then submit it. */
void bch_submit_bbio(struct bio *bio, struct cache_set *c,
		     struct bkey *k, unsigned ptr)
{
	struct bbio *b = container_of(bio, struct bbio, bio);
	bch_bkey_copy_single_ptr(&b->key, k, ptr);
	__bch_submit_bbio(bio, c);
}
305 | |||
/* IO errors */

/*
 * Record the outcome of an IO to @ca.  Errors accumulate in a decaying
 * counter: every error_decay IOs the error count is rescaled by 127/128,
 * and once the (shifted) count exceeds error_limit the whole cache set is
 * failed via bch_cache_set_error().
 */
void bch_count_io_errors(struct cache *ca, int error, const char *m)
{
	/*
	 * The halflife of an error is:
	 * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
	 */

	if (ca->set->error_decay) {
		unsigned count = atomic_inc_return(&ca->io_count);

		while (count > ca->set->error_decay) {
			unsigned errors;
			unsigned old = count;
			unsigned new = count - ca->set->error_decay;

			/*
			 * First we subtract refresh from count; each time we
			 * succesfully do so, we rescale the errors once:
			 */

			count = atomic_cmpxchg(&ca->io_count, old, new);

			if (count == old) {
				count = new;

				/* cmpxchg retry loop: decay errors by 127/128 */
				errors = atomic_read(&ca->io_errors);
				do {
					old = errors;
					new = ((uint64_t) errors * 127) / 128;
					errors = atomic_cmpxchg(&ca->io_errors,
								old, new);
				} while (old != errors);
			}
		}
	}

	if (error) {
		char buf[BDEVNAME_SIZE];
		/* Shifted so decay can remove fractions of an error. */
		unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
						    &ca->io_errors);
		errors >>= IO_ERROR_SHIFT;

		if (errors < ca->set->error_limit)
			pr_err("%s: IO error on %s, recovering",
			       bdevname(ca->bdev, buf), m);
		else
			bch_cache_set_error(ca->set,
					    "%s: too many IO errors %s",
					    bdevname(ca->bdev, buf), m);
	}
}
359 | |||
/*
 * Account a completed bbio: update the cache set's congestion estimate
 * from the IO's latency against the configured read/write threshold, then
 * feed the error into bch_count_io_errors().
 */
void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
			      int error, const char *m)
{
	struct bbio *b = container_of(bio, struct bbio, bio);
	struct cache *ca = PTR_CACHE(c, &b->key, 0);

	unsigned threshold = bio->bi_rw & REQ_WRITE
		? c->congested_write_threshold_us
		: c->congested_read_threshold_us;

	if (threshold) {
		unsigned t = local_clock_us();

		int us = t - b->submit_time_us;
		int congested = atomic_read(&c->congested);

		if (us > (int) threshold) {
			/* Slow IO: push the congestion counter down. */
			int ms = us / 1024;
			c->congested_last_us = t;

			ms = min(ms, CONGESTED_MAX + congested);
			atomic_sub(ms, &c->congested);
		} else if (congested < 0)
			/* Fast IO: let the counter recover toward zero. */
			atomic_inc(&c->congested);
	}

	bch_count_io_errors(ca, error, m);
}

/*
 * Standard bbio completion: account errors/latency, release the bio, and
 * drop the ref on the closure stored in bi_private.
 */
void bch_bbio_endio(struct cache_set *c, struct bio *bio,
		    int error, const char *m)
{
	struct closure *cl = bio->bi_private;

	bch_bbio_count_io_errors(c, bio, error, m);
	bio_put(bio);
	closure_put(cl);
}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c new file mode 100644 index 000000000000..8c8dfdcd9d4c --- /dev/null +++ b/drivers/md/bcache/journal.c | |||
@@ -0,0 +1,787 @@ | |||
1 | /* | ||
2 | * bcache journalling code, for btree insertions | ||
3 | * | ||
4 | * Copyright 2012 Google, Inc. | ||
5 | */ | ||
6 | |||
7 | #include "bcache.h" | ||
8 | #include "btree.h" | ||
9 | #include "debug.h" | ||
10 | #include "request.h" | ||
11 | |||
12 | /* | ||
13 | * Journal replay/recovery: | ||
14 | * | ||
15 | * This code is all driven from run_cache_set(); we first read the journal | ||
16 | * entries, do some other stuff, then we mark all the keys in the journal | ||
17 | * entries (same as garbage collection would), then we replay them - reinserting | ||
18 | * them into the cache in precisely the same order as they appear in the | ||
19 | * journal. | ||
20 | * | ||
21 | * We only journal keys that go in leaf nodes, which simplifies things quite a | ||
22 | * bit. | ||
23 | */ | ||
24 | |||
25 | static void journal_read_endio(struct bio *bio, int error) | ||
26 | { | ||
27 | struct closure *cl = bio->bi_private; | ||
28 | closure_put(cl); | ||
29 | } | ||
30 | |||
/*
 * Read the journal entries stored in bucket @bucket_index on @ca and splice
 * the valid ones into @list, keeping the list sorted by sequence number and
 * dropping entries older than the newest last_seq seen so far.
 *
 * Returns 1 if at least one entry was added, 0 if none, < 0 on error.
 */
static int journal_read_bucket(struct cache *ca, struct list_head *list,
			       struct btree_op *op, unsigned bucket_index)
{
	struct journal_device *ja = &ca->journal;
	struct bio *bio = &ja->bio;

	struct journal_replay *i;
	struct jset *j, *data = ca->set->journal.w[0].data;
	unsigned len, left, offset = 0;
	int ret = 0;
	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);

	pr_debug("reading %llu", (uint64_t) bucket);

	while (offset < ca->sb.bucket_size) {
		/* Read the bucket in chunks of up to 8 pages at a time */
reread:		left = ca->sb.bucket_size - offset;
		len = min_t(unsigned, left, PAGE_SECTORS * 8);

		bio_reset(bio);
		bio->bi_sector = bucket + offset;
		bio->bi_bdev = ca->bdev;
		bio->bi_rw = READ;
		bio->bi_size = len << 9;

		bio->bi_end_io = journal_read_endio;
		bio->bi_private = &op->cl;
		bch_bio_map(bio, data);

		/* Synchronous read: wait for journal_read_endio() */
		closure_bio_submit(bio, &op->cl, ca);
		closure_sync(&op->cl);

		/* This function could be simpler now since we no longer write
		 * journal entries that overlap bucket boundaries; this means
		 * the start of a bucket will always have a valid journal entry
		 * if it has any journal entries at all.
		 */

		j = data;
		while (len) {
			struct list_head *where;
			size_t blocks, bytes = set_bytes(j);

			/* Wrong magic: no (more) journal entries in this bucket */
			if (j->magic != jset_magic(ca->set))
				return ret;

			/* Entry claims to be bigger than the rest of the bucket */
			if (bytes > left << 9)
				return ret;

			/* Entry extends past what we've read: reread with more data */
			if (bytes > len << 9)
				goto reread;

			/* Bad checksum: treat as end of valid data */
			if (j->csum != csum_set(j))
				return ret;

			blocks = set_blocks(j, ca->set);

			/* Drop list entries this jset says are no longer needed */
			while (!list_empty(list)) {
				i = list_first_entry(list,
					struct journal_replay, list);
				if (i->j.seq >= j->last_seq)
					break;
				list_del(&i->list);
				kfree(i);
			}

			/*
			 * Find the insertion point, keeping the list sorted by
			 * seq; skip duplicates and entries already superseded.
			 */
			list_for_each_entry_reverse(i, list, list) {
				if (j->seq == i->j.seq)
					goto next_set;

				if (j->seq < i->j.last_seq)
					goto next_set;

				if (j->seq > i->j.seq) {
					where = &i->list;
					goto add;
				}
			}

			where = list;
add:
			i = kmalloc(offsetof(struct journal_replay, j) +
				    bytes, GFP_KERNEL);
			if (!i)
				return -ENOMEM;
			memcpy(&i->j, j, bytes);
			list_add(&i->list, where);
			ret = 1;

			/*
			 * NOTE(review): this overwrites unconditionally; entries
			 * are parsed in ascending disk order, so the last one is
			 * presumably the newest in the bucket - confirm.
			 */
			ja->seq[bucket_index] = j->seq;
next_set:
			offset += blocks * ca->sb.block_size;
			len -= blocks * ca->sb.block_size;
			j = ((void *) j) + blocks * block_bytes(ca);
		}
	}

	return ret;
}
129 | |||
130 | int bch_journal_read(struct cache_set *c, struct list_head *list, | ||
131 | struct btree_op *op) | ||
132 | { | ||
133 | #define read_bucket(b) \ | ||
134 | ({ \ | ||
135 | int ret = journal_read_bucket(ca, list, op, b); \ | ||
136 | __set_bit(b, bitmap); \ | ||
137 | if (ret < 0) \ | ||
138 | return ret; \ | ||
139 | ret; \ | ||
140 | }) | ||
141 | |||
142 | struct cache *ca; | ||
143 | unsigned iter; | ||
144 | |||
145 | for_each_cache(ca, c, iter) { | ||
146 | struct journal_device *ja = &ca->journal; | ||
147 | unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG]; | ||
148 | unsigned i, l, r, m; | ||
149 | uint64_t seq; | ||
150 | |||
151 | bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); | ||
152 | pr_debug("%u journal buckets", ca->sb.njournal_buckets); | ||
153 | |||
154 | /* Read journal buckets ordered by golden ratio hash to quickly | ||
155 | * find a sequence of buckets with valid journal entries | ||
156 | */ | ||
157 | for (i = 0; i < ca->sb.njournal_buckets; i++) { | ||
158 | l = (i * 2654435769U) % ca->sb.njournal_buckets; | ||
159 | |||
160 | if (test_bit(l, bitmap)) | ||
161 | break; | ||
162 | |||
163 | if (read_bucket(l)) | ||
164 | goto bsearch; | ||
165 | } | ||
166 | |||
167 | /* If that fails, check all the buckets we haven't checked | ||
168 | * already | ||
169 | */ | ||
170 | pr_debug("falling back to linear search"); | ||
171 | |||
172 | for (l = 0; l < ca->sb.njournal_buckets; l++) { | ||
173 | if (test_bit(l, bitmap)) | ||
174 | continue; | ||
175 | |||
176 | if (read_bucket(l)) | ||
177 | goto bsearch; | ||
178 | } | ||
179 | bsearch: | ||
180 | /* Binary search */ | ||
181 | m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); | ||
182 | pr_debug("starting binary search, l %u r %u", l, r); | ||
183 | |||
184 | while (l + 1 < r) { | ||
185 | m = (l + r) >> 1; | ||
186 | |||
187 | if (read_bucket(m)) | ||
188 | l = m; | ||
189 | else | ||
190 | r = m; | ||
191 | } | ||
192 | |||
193 | /* Read buckets in reverse order until we stop finding more | ||
194 | * journal entries | ||
195 | */ | ||
196 | pr_debug("finishing up"); | ||
197 | l = m; | ||
198 | |||
199 | while (1) { | ||
200 | if (!l--) | ||
201 | l = ca->sb.njournal_buckets - 1; | ||
202 | |||
203 | if (l == m) | ||
204 | break; | ||
205 | |||
206 | if (test_bit(l, bitmap)) | ||
207 | continue; | ||
208 | |||
209 | if (!read_bucket(l)) | ||
210 | break; | ||
211 | } | ||
212 | |||
213 | seq = 0; | ||
214 | |||
215 | for (i = 0; i < ca->sb.njournal_buckets; i++) | ||
216 | if (ja->seq[i] > seq) { | ||
217 | seq = ja->seq[i]; | ||
218 | ja->cur_idx = ja->discard_idx = | ||
219 | ja->last_idx = i; | ||
220 | |||
221 | } | ||
222 | } | ||
223 | |||
224 | c->journal.seq = list_entry(list->prev, | ||
225 | struct journal_replay, | ||
226 | list)->j.seq; | ||
227 | |||
228 | return 0; | ||
229 | #undef read_bucket | ||
230 | } | ||
231 | |||
/*
 * Mark the keys from the journal entries on @list as in use (as garbage
 * collection would), and rebuild the journal pin fifo so the replayed
 * entries stay pinned until their keys are reinserted into the btree.
 */
void bch_journal_mark(struct cache_set *c, struct list_head *list)
{
	atomic_t p = { 0 };
	struct bkey *k;
	struct journal_replay *i;
	struct journal *j = &c->journal;
	uint64_t last = j->seq;

	/*
	 * journal.pin should never fill up - we never write a journal
	 * entry when it would fill up. But if for some reason it does, we
	 * iterate over the list in reverse order so that we can just skip that
	 * refcount instead of bugging.
	 */

	list_for_each_entry_reverse(i, list, list) {
		BUG_ON(last < i->j.seq);
		i->pin = NULL;

		/* Push zero refcounts for seqs that have no surviving entry */
		while (last-- != i->j.seq)
			if (fifo_free(&j->pin) > 1) {
				fifo_push_front(&j->pin, p);
				atomic_set(&fifo_front(&j->pin), 0);
			}

		/* Pin this entry with an initial refcount of 1 */
		if (fifo_free(&j->pin) > 1) {
			fifo_push_front(&j->pin, p);
			i->pin = &fifo_front(&j->pin);
			atomic_set(i->pin, 1);
		}

		for (k = i->j.start;
		     k < end(&i->j);
		     k = bkey_next(k)) {
			unsigned j;

			for (j = 0; j < KEY_PTRS(k); j++) {
				struct bucket *g = PTR_BUCKET(c, k, j);
				atomic_inc(&g->pin);

				/*
				 * Non-stale bucket still at btree prio: give it
				 * data prio since a journalled key points at it.
				 */
				if (g->prio == BTREE_PRIO &&
				    !ptr_stale(c, k, j))
					g->prio = INITIAL_PRIO;
			}

			__bch_btree_mark_key(c, 0, k);
		}
	}
}
281 | |||
282 | int bch_journal_replay(struct cache_set *s, struct list_head *list, | ||
283 | struct btree_op *op) | ||
284 | { | ||
285 | int ret = 0, keys = 0, entries = 0; | ||
286 | struct bkey *k; | ||
287 | struct journal_replay *i = | ||
288 | list_entry(list->prev, struct journal_replay, list); | ||
289 | |||
290 | uint64_t start = i->j.last_seq, end = i->j.seq, n = start; | ||
291 | |||
292 | list_for_each_entry(i, list, list) { | ||
293 | BUG_ON(i->pin && atomic_read(i->pin) != 1); | ||
294 | |||
295 | if (n != i->j.seq) | ||
296 | pr_err( | ||
297 | "journal entries %llu-%llu missing! (replaying %llu-%llu)\n", | ||
298 | n, i->j.seq - 1, start, end); | ||
299 | |||
300 | for (k = i->j.start; | ||
301 | k < end(&i->j); | ||
302 | k = bkey_next(k)) { | ||
303 | pr_debug("%s", pkey(k)); | ||
304 | bkey_copy(op->keys.top, k); | ||
305 | bch_keylist_push(&op->keys); | ||
306 | |||
307 | op->journal = i->pin; | ||
308 | atomic_inc(op->journal); | ||
309 | |||
310 | ret = bch_btree_insert(op, s); | ||
311 | if (ret) | ||
312 | goto err; | ||
313 | |||
314 | BUG_ON(!bch_keylist_empty(&op->keys)); | ||
315 | keys++; | ||
316 | |||
317 | cond_resched(); | ||
318 | } | ||
319 | |||
320 | if (i->pin) | ||
321 | atomic_dec(i->pin); | ||
322 | n = i->j.seq + 1; | ||
323 | entries++; | ||
324 | } | ||
325 | |||
326 | pr_info("journal replay done, %i keys in %i entries, seq %llu", | ||
327 | keys, entries, end); | ||
328 | |||
329 | while (!list_empty(list)) { | ||
330 | i = list_first_entry(list, struct journal_replay, list); | ||
331 | list_del(&i->list); | ||
332 | kfree(i); | ||
333 | } | ||
334 | err: | ||
335 | closure_sync(&op->cl); | ||
336 | return ret; | ||
337 | } | ||
338 | |||
339 | /* Journalling */ | ||
340 | |||
/*
 * Flush the dirty btree node pinning the oldest journal entry, to free up
 * journal space. Scans the btree node cache with write-trylocks; if no
 * suitable node can be locked, falls back to the first dirty leaf node.
 */
static void btree_flush_write(struct cache_set *c)
{
	/*
	 * Try to find the btree node with that references the oldest journal
	 * entry, best is our current candidate and is locked if non NULL:
	 */
	struct btree *b, *best = NULL;
	unsigned iter;

	for_each_cached_btree(b, c, iter) {
		/* Skip contended nodes rather than blocking */
		if (!down_write_trylock(&b->lock))
			continue;

		if (!btree_node_dirty(b) ||
		    !btree_current_write(b)->journal) {
			rw_unlock(true, b);
			continue;
		}

		if (!best)
			best = b;
		else if (journal_pin_cmp(c,
					 btree_current_write(best),
					 btree_current_write(b))) {
			/* b pins an older journal entry: prefer it */
			rw_unlock(true, best);
			best = b;
		} else
			rw_unlock(true, b);
	}

	if (best)
		goto out;

	/* We can't find the best btree node, just pick the first */
	list_for_each_entry(b, &c->btree_cache, list)
		if (!b->level && btree_node_dirty(b)) {
			best = b;
			rw_lock(true, best, best->level);
			goto found;
		}

out:
	if (!best)
		return;
found:
	/* Recheck: the node may have been written while we took the lock */
	if (btree_node_dirty(best))
		bch_btree_write(best, true, NULL);
	rw_unlock(true, best);
}
390 | |||
/* Seq of the oldest journal entry still pinned: newest seq minus open entries + 1 */
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
392 | |||
393 | static void journal_discard_endio(struct bio *bio, int error) | ||
394 | { | ||
395 | struct journal_device *ja = | ||
396 | container_of(bio, struct journal_device, discard_bio); | ||
397 | struct cache *ca = container_of(ja, struct cache, journal); | ||
398 | |||
399 | atomic_set(&ja->discard_in_flight, DISCARD_DONE); | ||
400 | |||
401 | closure_wake_up(&ca->set->journal.wait); | ||
402 | closure_put(&ca->set->cl); | ||
403 | } | ||
404 | |||
405 | static void journal_discard_work(struct work_struct *work) | ||
406 | { | ||
407 | struct journal_device *ja = | ||
408 | container_of(work, struct journal_device, discard_work); | ||
409 | |||
410 | submit_bio(0, &ja->discard_bio); | ||
411 | } | ||
412 | |||
413 | static void do_journal_discard(struct cache *ca) | ||
414 | { | ||
415 | struct journal_device *ja = &ca->journal; | ||
416 | struct bio *bio = &ja->discard_bio; | ||
417 | |||
418 | if (!ca->discard) { | ||
419 | ja->discard_idx = ja->last_idx; | ||
420 | return; | ||
421 | } | ||
422 | |||
423 | switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { | ||
424 | case DISCARD_IN_FLIGHT: | ||
425 | return; | ||
426 | |||
427 | case DISCARD_DONE: | ||
428 | ja->discard_idx = (ja->discard_idx + 1) % | ||
429 | ca->sb.njournal_buckets; | ||
430 | |||
431 | atomic_set(&ja->discard_in_flight, DISCARD_READY); | ||
432 | /* fallthrough */ | ||
433 | |||
434 | case DISCARD_READY: | ||
435 | if (ja->discard_idx == ja->last_idx) | ||
436 | return; | ||
437 | |||
438 | atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); | ||
439 | |||
440 | bio_init(bio); | ||
441 | bio->bi_sector = bucket_to_sector(ca->set, | ||
442 | ca->sb.d[ja->discard_idx]); | ||
443 | bio->bi_bdev = ca->bdev; | ||
444 | bio->bi_rw = REQ_WRITE|REQ_DISCARD; | ||
445 | bio->bi_max_vecs = 1; | ||
446 | bio->bi_io_vec = bio->bi_inline_vecs; | ||
447 | bio->bi_size = bucket_bytes(ca); | ||
448 | bio->bi_end_io = journal_discard_endio; | ||
449 | |||
450 | closure_get(&ca->set->cl); | ||
451 | INIT_WORK(&ja->discard_work, journal_discard_work); | ||
452 | schedule_work(&ja->discard_work); | ||
453 | } | ||
454 | } | ||
455 | |||
/*
 * Free up journal space: pop fully-released pins off the front of the pin
 * fifo, advance each device's last_idx past buckets whose newest entry is
 * older than last_seq, kick off discards, and - if we're out of blocks -
 * advance to the next journal bucket on each device, recording the new
 * write locations in c->journal.key.
 *
 * Called with c->journal.lock held.
 */
static void journal_reclaim(struct cache_set *c)
{
	struct bkey *k = &c->journal.key;
	struct cache *ca;
	uint64_t last_seq;
	unsigned iter, n = 0;
	atomic_t p;

	/* Drop fully-closed journal entries off the front of the pin fifo */
	while (!atomic_read(&fifo_front(&c->journal.pin)))
		fifo_pop(&c->journal.pin, p);

	last_seq = last_seq(&c->journal);

	/* Update last_idx */

	for_each_cache(ca, c, iter) {
		struct journal_device *ja = &ca->journal;

		while (ja->last_idx != ja->cur_idx &&
		       ja->seq[ja->last_idx] < last_seq)
			ja->last_idx = (ja->last_idx + 1) %
				ca->sb.njournal_buckets;
	}

	for_each_cache(ca, c, iter)
		do_journal_discard(ca);

	/* Still room in the current bucket(s): nothing more to do */
	if (c->journal.blocks_free)
		return;

	/*
	 * Allocate:
	 * XXX: Sort by free journal space
	 */

	for_each_cache(ca, c, iter) {
		struct journal_device *ja = &ca->journal;
		unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;

		/* No space available on this device */
		if (next == ja->discard_idx)
			continue;

		ja->cur_idx = next;
		k->ptr[n++] = PTR(0,
				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
				  ca->sb.nr_this_dev);
	}

	/*
	 * NOTE(review): bkey_init() appears to reset only the key header,
	 * leaving the ptrs written above intact - confirm against the
	 * bkey_init() definition in bcache.h.
	 */
	bkey_init(k);
	SET_KEY_PTRS(k, n);

	if (n)
		c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;

	if (!journal_full(&c->journal))
		__closure_wake_up(&c->journal.wait);
}
514 | |||
/*
 * Open a new journal entry: switch to the other in-memory write buffer,
 * bump the sequence number, and push a fresh pin refcount (initialized to
 * 1) onto the back of the pin fifo.
 */
void bch_journal_next(struct journal *j)
{
	atomic_t p = { 1 };

	/* Flip between the two write buffers */
	j->cur = (j->cur == j->w)
		? &j->w[1]
		: &j->w[0];

	/*
	 * The fifo_push() needs to happen at the same time as j->seq is
	 * incremented for last_seq() to be calculated correctly
	 */
	BUG_ON(!fifo_push(&j->pin, p));
	atomic_set(&fifo_back(&j->pin), 1);

	j->cur->data->seq = ++j->seq;
	j->cur->need_write = false;
	j->cur->data->keys = 0;

	if (fifo_full(&j->pin))
		pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
}
537 | |||
538 | static void journal_write_endio(struct bio *bio, int error) | ||
539 | { | ||
540 | struct journal_write *w = bio->bi_private; | ||
541 | |||
542 | cache_set_err_on(error, w->c, "journal io error"); | ||
543 | closure_put(&w->c->journal.io.cl); | ||
544 | } | ||
545 | |||
546 | static void journal_write(struct closure *); | ||
547 | |||
/*
 * All bios for a journal write have completed: wake up waiters, optionally
 * rate-limit, and loop back to journal_write() for the next entry.
 */
static void journal_write_done(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io.cl);
	struct cache_set *c = container_of(j, struct cache_set, journal);

	/*
	 * bch_journal_next() already advanced j->cur before the bios were
	 * submitted, so the write that just finished is the *other* buffer.
	 */
	struct journal_write *w = (j->cur == j->w)
		? &j->w[1]
		: &j->w[0];

	__closure_wake_up(&w->wait);

	/* Throttle journal writes if journal_delay_ms is set */
	if (c->journal_delay_ms)
		closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));

	continue_at(cl, journal_write, system_wq);
}
564 | |||
/*
 * Write out the current journal entry to every cache device pointed at by
 * c->journal.key. Called with c->journal.lock held and the journal io
 * closure lock taken; always releases c->journal.lock.
 */
static void journal_write_unlocked(struct closure *cl)
__releases(c->journal.lock)
{
	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
	struct cache *ca;
	struct journal_write *w = c->journal.cur;
	struct bkey *k = &c->journal.key;
	unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;

	struct bio *bio;
	struct bio_list list;
	bio_list_init(&list);

	if (!w->need_write) {
		/*
		 * XXX: have to unlock closure before we unlock journal lock,
		 * else we race with bch_journal(). But this way we race
		 * against cache set unregister. Doh.
		 */
		set_closure_fn(cl, NULL, NULL);
		closure_sub(cl, CLOSURE_RUNNING + 1);
		spin_unlock(&c->journal.lock);
		return;
	} else if (journal_full(&c->journal)) {
		/* No space to write: reclaim, flush dirty btree nodes, retry */
		journal_reclaim(c);
		spin_unlock(&c->journal.lock);

		btree_flush_write(c);
		continue_at(cl, journal_write, system_wq);
	}

	c->journal.blocks_free -= set_blocks(w->data, c);

	/* Fill in the superblock-ish metadata carried in each journal entry */
	w->data->btree_level = c->root->level;

	bkey_copy(&w->data->btree_root, &c->root->key);
	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);

	for_each_cache(ca, c, i)
		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];

	w->data->magic = jset_magic(c);
	w->data->version = BCACHE_JSET_VERSION;
	w->data->last_seq = last_seq(&c->journal);
	w->data->csum = csum_set(w->data);

	/* Build one bio per cache device the journal key points at */
	for (i = 0; i < KEY_PTRS(k); i++) {
		ca = PTR_CACHE(c, k, i);
		bio = &ca->journal.bio;

		atomic_long_add(sectors, &ca->meta_sectors_written);

		bio_reset(bio);
		bio->bi_sector = PTR_OFFSET(k, i);
		bio->bi_bdev = ca->bdev;
		bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
		bio->bi_size = sectors << 9;

		bio->bi_end_io = journal_write_endio;
		bio->bi_private = w;
		bch_bio_map(bio, w->data);

		trace_bcache_journal_write(bio);
		bio_list_add(&list, bio);

		/* Advance the write position past what we're about to write */
		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);

		ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
	}

	/* Open the next entry before dropping the lock, then submit the IO */
	atomic_dec_bug(&fifo_back(&c->journal.pin));
	bch_journal_next(&c->journal);
	journal_reclaim(c);

	spin_unlock(&c->journal.lock);

	while ((bio = bio_list_pop(&list)))
		closure_bio_submit(bio, cl, c->cache[0]);

	continue_at(cl, journal_write_done, NULL);
}
646 | |||
/*
 * Closure entry point for a journal write: take the journal lock and hand
 * off to journal_write_unlocked(), which releases it.
 */
static void journal_write(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);

	spin_lock(&c->journal.lock);
	journal_write_unlocked(cl);
}
654 | |||
/*
 * Start a journal write if the journal io closure isn't already busy.
 * Called with c->journal.lock held; always releases it.
 *
 * @noflush: if set and the journal is full, defer the write to the
 * workqueue instead of calling journal_write_unlocked() here.
 */
static void __journal_try_write(struct cache_set *c, bool noflush)
__releases(c->journal.lock)
{
	struct closure *cl = &c->journal.io.cl;

	if (!closure_trylock(cl, &c->cl))
		/* A write is already in progress */
		spin_unlock(&c->journal.lock);
	else if (noflush && journal_full(&c->journal)) {
		spin_unlock(&c->journal.lock);
		continue_at(cl, journal_write, system_wq);
	} else
		journal_write_unlocked(cl);
}

#define journal_try_write(c) __journal_try_write(c, false)
670 | |||
/*
 * Request a journal write without adding any keys, so the metadata carried
 * in the journal header (btree root etc.) gets persisted. Only does
 * anything when the cache set is in sync mode. If @cl is non-NULL, it is
 * put on the write's wait list so the caller can wait for completion.
 */
void bch_journal_meta(struct cache_set *c, struct closure *cl)
{
	struct journal_write *w;

	if (CACHE_SYNC(&c->sb)) {
		spin_lock(&c->journal.lock);

		w = c->journal.cur;
		w->need_write = true;

		if (cl)
			BUG_ON(!closure_wait(&w->wait, cl));

		__journal_try_write(c, true);
	}
}
687 | |||
688 | /* | ||
689 | * Entry point to the journalling code - bio_insert() and btree_invalidate() | ||
690 | * pass bch_journal() a list of keys to be journalled, and then | ||
691 | * bch_journal() hands those same keys off to btree_insert_async() | ||
692 | */ | ||
693 | |||
/*
 * Append the keys in op->keys to the current in-memory journal entry,
 * waiting (by rescheduling this closure) if the journal is full or the
 * entry can't hold them, take a pin on the entry, and then pass the keys
 * on to the btree insert path.
 */
void bch_journal(struct closure *cl)
{
	struct btree_op *op = container_of(cl, struct btree_op, cl);
	struct cache_set *c = op->c;
	struct journal_write *w;
	/* n = size of op->keys in u64s */
	size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;

	/* Only btree inserts on sync cache sets are journalled */
	if (op->type != BTREE_INSERT ||
	    !CACHE_SYNC(&c->sb))
		goto out;

	/*
	 * If we're looping because we errored, might already be waiting on
	 * another journal write:
	 */
	while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
		closure_sync(cl->parent);

	spin_lock(&c->journal.lock);

	if (journal_full(&c->journal)) {
		/* XXX: tracepoint */
		closure_wait(&c->journal.wait, cl);

		journal_reclaim(c);
		spin_unlock(&c->journal.lock);

		/* Flush dirty btree nodes to free journal space, then retry */
		btree_flush_write(c);
		continue_at(cl, bch_journal, bcache_wq);
	}

	w = c->journal.cur;
	w->need_write = true;
	b = __set_blocks(w->data, w->data->keys + n, c);

	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
	    b > c->journal.blocks_free) {
		/* XXX: If we were inserting so many keys that they won't fit in
		 * an _empty_ journal write, we'll deadlock. For now, handle
		 * this in bch_keylist_realloc() - but something to think about.
		 */
		BUG_ON(!w->data->keys);

		/* XXX: tracepoint */
		BUG_ON(!closure_wait(&w->wait, cl));

		/* Keys don't fit: flush the current entry out and retry */
		closure_flush(&c->journal.io);

		journal_try_write(c);
		continue_at(cl, bch_journal, bcache_wq);
	}

	/* Append our keys and take a ref on the entry we added them to */
	memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
	w->data->keys += n;

	op->journal = &fifo_back(&c->journal.pin);
	atomic_inc(op->journal);

	if (op->flush_journal) {
		/* Caller wants the journal write started and to wait on it */
		closure_flush(&c->journal.io);
		closure_wait(&w->wait, cl->parent);
	}

	journal_try_write(c);
out:
	bch_btree_insert_async(cl);
}
761 | |||
762 | void bch_journal_free(struct cache_set *c) | ||
763 | { | ||
764 | free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); | ||
765 | free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); | ||
766 | free_fifo(&c->journal.pin); | ||
767 | } | ||
768 | |||
769 | int bch_journal_alloc(struct cache_set *c) | ||
770 | { | ||
771 | struct journal *j = &c->journal; | ||
772 | |||
773 | closure_init_unlocked(&j->io); | ||
774 | spin_lock_init(&j->lock); | ||
775 | |||
776 | c->journal_delay_ms = 100; | ||
777 | |||
778 | j->w[0].c = c; | ||
779 | j->w[1].c = c; | ||
780 | |||
781 | if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || | ||
782 | !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || | ||
783 | !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) | ||
784 | return -ENOMEM; | ||
785 | |||
786 | return 0; | ||
787 | } | ||
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h new file mode 100644 index 000000000000..3d7851274b04 --- /dev/null +++ b/drivers/md/bcache/journal.h | |||
@@ -0,0 +1,215 @@ | |||
1 | #ifndef _BCACHE_JOURNAL_H | ||
2 | #define _BCACHE_JOURNAL_H | ||
3 | |||
4 | /* | ||
5 | * THE JOURNAL: | ||
6 | * | ||
7 | * The journal is treated as a circular buffer of buckets - a journal entry | ||
8 | * never spans two buckets. This means (not implemented yet) we can resize the | ||
9 | * journal at runtime, and will be needed for bcache on raw flash support. | ||
10 | * | ||
11 | * Journal entries contain a list of keys, ordered by the time they were | ||
12 | * inserted; thus journal replay just has to reinsert the keys. | ||
13 | * | ||
14 | * We also keep some things in the journal header that are logically part of the | ||
15 | * superblock - all the things that are frequently updated. This is for future | ||
16 | * bcache on raw flash support; the superblock (which will become another | ||
17 | * journal) can't be moved or wear leveled, so it contains just enough | ||
18 | * information to find the main journal, and the superblock only has to be | ||
19 | * rewritten when we want to move/wear level the main journal. | ||
20 | * | ||
21 | * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be | ||
22 | * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions | ||
23 | * from cache misses, which don't have to be journaled, and for writeback and | ||
24 | * moving gc we work around it by flushing the btree to disk before updating the | ||
25 | * gc information. But it is a potential issue with incremental garbage | ||
26 | * collection, and it's fragile. | ||
27 | * | ||
28 | * OPEN JOURNAL ENTRIES: | ||
29 | * | ||
30 | * Each journal entry contains, in the header, the sequence number of the last | ||
31 | * journal entry still open - i.e. that has keys that haven't been flushed to | ||
32 | * disk in the btree. | ||
33 | * | ||
34 | * We track this by maintaining a refcount for every open journal entry, in a | ||
35 | * fifo; each entry in the fifo corresponds to a particular journal | ||
36 | * entry/sequence number. When the refcount at the tail of the fifo goes to | ||
37 | * zero, we pop it off - thus, the size of the fifo tells us the number of open | ||
38 | * journal entries | ||
39 | * | ||
40 | * We take a refcount on a journal entry when we add some keys to a journal | ||
41 | * entry that we're going to insert (held by struct btree_op), and then when we | ||
42 | * insert those keys into the btree the btree write we're setting up takes a | ||
43 | * copy of that refcount (held by struct btree_write). That refcount is dropped | ||
44 | * when the btree write completes. | ||
45 | * | ||
46 | * A struct btree_write can only hold a refcount on a single journal entry, but | ||
47 | * might contain keys for many journal entries - we handle this by making sure | ||
48 | * it always has a refcount on the _oldest_ journal entry of all the journal | ||
49 | * entries it has keys for. | ||
50 | * | ||
51 | * JOURNAL RECLAIM: | ||
52 | * | ||
53 | * As mentioned previously, our fifo of refcounts tells us the number of open | ||
54 | * journal entries; from that and the current journal sequence number we compute | ||
55 | * last_seq - the oldest journal entry we still need. We write last_seq in each | ||
56 | * journal entry, and we also have to keep track of where it exists on disk so | ||
57 | * we don't overwrite it when we loop around the journal. | ||
58 | * | ||
59 | * To do that we track, for each journal bucket, the sequence number of the | ||
60 | * newest journal entry it contains - if we don't need that journal entry we | ||
61 | * don't need anything in that bucket anymore. From that we track the last | ||
62 | * journal bucket we still need; all this is tracked in struct journal_device | ||
63 | * and updated by journal_reclaim(). | ||
64 | * | ||
65 | * JOURNAL FILLING UP: | ||
66 | * | ||
67 | * There are two ways the journal could fill up; either we could run out of | ||
68 | * space to write to, or we could have too many open journal entries and run out | ||
69 | * of room in the fifo of refcounts. Since those refcounts are decremented | ||
70 | * without any locking we can't safely resize that fifo, so we handle it the | ||
71 | * same way. | ||
72 | * | ||
73 | * If the journal fills up, we start flushing dirty btree nodes until we can | ||
74 | * allocate space for a journal write again - preferentially flushing btree | ||
75 | * nodes that are pinning the oldest journal entries first. | ||
76 | */ | ||
77 | |||
78 | #define BCACHE_JSET_VERSION_UUIDv1 1 | ||
79 | /* Always latest UUID format */ | ||
80 | #define BCACHE_JSET_VERSION_UUID 1 | ||
81 | #define BCACHE_JSET_VERSION 1 | ||
82 | |||
83 | /* | ||
84 | * On disk format for a journal entry: | ||
85 | * seq is monotonically increasing; every journal entry has its own unique | ||
86 | * sequence number. | ||
87 | * | ||
88 | * last_seq is the oldest journal entry that still has keys the btree hasn't | ||
89 | * flushed to disk yet. | ||
90 | * | ||
91 | * version is for on disk format changes. | ||
92 | */ | ||
struct jset {
	uint64_t csum;		/* checksum of the entry (see csum_set()) */
	uint64_t magic;		/* per-cache-set magic (see jset_magic()) */
	uint64_t seq;		/* this entry's sequence number */
	uint32_t version;	/* on disk format version */
	uint32_t keys;		/* size of the key area, in u64s */

	uint64_t last_seq;	/* oldest entry the btree hasn't flushed yet */

	BKEY_PADDED(uuid_bucket);	/* where the uuids were last written */
	BKEY_PADDED(btree_root);	/* key of the current btree root */
	uint16_t btree_level;		/* level of the btree root */
	uint16_t pad[3];

	/* prio bucket per cache device, indexed by sb.nr_this_dev */
	uint64_t prio_bucket[MAX_CACHES_PER_SET];

	/* The journalled keys follow the header, accessible either way */
	union {
		struct bkey start[0];
		uint64_t d[0];
	};
};
114 | |||
115 | /* | ||
116 | * Only used for holding the journal entries we read in btree_journal_read() | ||
117 | * during cache_registration | ||
118 | */ | ||
struct journal_replay {
	struct list_head list;	/* entry on the seq-ordered replay list */
	atomic_t *pin;		/* journal pin set up by bch_journal_mark() */
	struct jset j;		/* the entry read from disk (variable length) */
};
124 | |||
125 | /* | ||
126 | * We put two of these in struct journal; we used them for writes to the | ||
127 | * journal that are being staged or in flight. | ||
128 | */ | ||
struct journal_write {
	struct jset *data;	/* buffer the entry is built in / written from */
#define JSET_BITS 3		/* data buffer is 1 << JSET_BITS pages */

	struct cache_set *c;
	/* woken when this write hits disk (journal_write_done()) */
	struct closure_waitlist wait;
	bool need_write;	/* entry has content that must be written */
};
137 | |||
138 | /* Embedded in struct cache_set */ | ||
/* Embedded in struct cache_set */
struct journal {
	spinlock_t lock;
	/* used when waiting because the journal was full */
	struct closure_waitlist wait;
	/* drives the journal write state machine (journal_write()) */
	struct closure_with_timer io;

	/* Number of blocks free in the bucket(s) we're currently writing to */
	unsigned blocks_free;
	uint64_t seq;		/* seq of the newest (current) journal entry */
	/* one refcount per open journal entry, oldest at the front */
	DECLARE_FIFO(atomic_t, pin);

	/* where the current entry will be written (one ptr per device) */
	BKEY_PADDED(key);

	/* double-buffered writes; cur is the entry being built */
	struct journal_write w[2], *cur;
};
154 | |||
155 | /* | ||
156 | * Embedded in struct cache. First three fields refer to the array of journal | ||
157 | * buckets, in cache_sb. | ||
158 | */ | ||
struct journal_device {
	/*
	 * For each journal bucket, contains the max sequence number of the
	 * journal writes it contains - so we know when a bucket can be reused.
	 */
	uint64_t seq[SB_JOURNAL_BUCKETS];

	/* Journal bucket we're currently writing to */
	unsigned cur_idx;

	/* Last journal bucket that still contains an open journal entry */
	unsigned last_idx;

	/* Next journal bucket to be discarded */
	unsigned discard_idx;

#define DISCARD_READY		0
#define DISCARD_IN_FLIGHT	1
#define DISCARD_DONE		2
	/* 1 - discard in flight, -1 - discard completed */
	atomic_t discard_in_flight;

	/* issues journal bucket discards asynchronously */
	struct work_struct discard_work;
	struct bio discard_bio;
	struct bio_vec discard_bv;

	/* Bio for journal reads/writes to this device */
	struct bio bio;
	struct bio_vec bv[8];
};
189 | |||
/*
 * True if @l holds a pin on a newer journal entry than @r (pins are pushed
 * at the back of the fifo, so a higher index means a more recent entry).
 */
#define journal_pin_cmp(c, l, r)				\
	(fifo_idx(&(c)->journal.pin, (l)->journal) >		\
	 fifo_idx(&(c)->journal.pin, (r)->journal))

/* Size of the pin fifo - the max number of open journal entries */
#define JOURNAL_PIN	20000

/* Full when out of blocks on disk, or (almost) out of room in the pin fifo */
#define journal_full(j)						\
	(!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
198 | |||
199 | struct closure; | ||
200 | struct cache_set; | ||
201 | struct btree_op; | ||
202 | |||
203 | void bch_journal(struct closure *); | ||
204 | void bch_journal_next(struct journal *); | ||
205 | void bch_journal_mark(struct cache_set *, struct list_head *); | ||
206 | void bch_journal_meta(struct cache_set *, struct closure *); | ||
207 | int bch_journal_read(struct cache_set *, struct list_head *, | ||
208 | struct btree_op *); | ||
209 | int bch_journal_replay(struct cache_set *, struct list_head *, | ||
210 | struct btree_op *); | ||
211 | |||
212 | void bch_journal_free(struct cache_set *); | ||
213 | int bch_journal_alloc(struct cache_set *); | ||
214 | |||
215 | #endif /* _BCACHE_JOURNAL_H */ | ||
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c new file mode 100644 index 000000000000..8589512c972e --- /dev/null +++ b/drivers/md/bcache/movinggc.c | |||
@@ -0,0 +1,254 @@ | |||
1 | /* | ||
2 | * Moving/copying garbage collector | ||
3 | * | ||
4 | * Copyright 2012 Google, Inc. | ||
5 | */ | ||
6 | |||
7 | #include "bcache.h" | ||
8 | #include "btree.h" | ||
9 | #include "debug.h" | ||
10 | #include "request.h" | ||
11 | |||
/*
 * State for moving one key's worth of data: the keybuf entry being moved,
 * the embedded search (btree op + closure) used to reread and reinsert the
 * data, and the bio used for both the read and the write.
 *
 * bio must be the last member: read_moving() allocates the bio_vec array
 * immediately after the struct, and moving_init() points bi_io_vec at
 * bi_inline_vecs.
 */
struct moving_io {
	struct keybuf_key *w;
	struct search s;
	struct bbio bio;
};
17 | |||
18 | static bool moving_pred(struct keybuf *buf, struct bkey *k) | ||
19 | { | ||
20 | struct cache_set *c = container_of(buf, struct cache_set, | ||
21 | moving_gc_keys); | ||
22 | unsigned i; | ||
23 | |||
24 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
25 | struct cache *ca = PTR_CACHE(c, k, i); | ||
26 | struct bucket *g = PTR_BUCKET(c, k, i); | ||
27 | |||
28 | if (GC_SECTORS_USED(g) < ca->gc_move_threshold) | ||
29 | return true; | ||
30 | } | ||
31 | |||
32 | return false; | ||
33 | } | ||
34 | |||
35 | /* Moving GC - IO loop */ | ||
36 | |||
/* Final closure callback: frees the moving_io once all its IO is done. */
static void moving_io_destructor(struct closure *cl)
{
	struct moving_io *io = container_of(cl, struct moving_io, s.cl);
	kfree(io);
}
42 | |||
/*
 * Runs after the write (reinsert) of a moved key completes: frees the
 * pages backing the bio, drops the keybuf entry, and releases one slot
 * of the in-flight throttle before destroying the closure.
 */
static void write_moving_finish(struct closure *cl)
{
	struct moving_io *io = container_of(cl, struct moving_io, s.cl);
	struct bio *bio = &io->bio.bio;
	/* One past the last bio_vec; walked backwards to free every page */
	struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt);

	while (bv-- != bio->bi_io_vec)
		__free_page(bv->bv_page);

	pr_debug("%s %s", io->s.op.insert_collision
		 ? "collision moving" : "moved",
		 pkey(&io->w->key));

	bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);

	/* Wake read_moving() if it was throttled on in_flight */
	atomic_dec_bug(&io->s.op.c->in_flight);
	closure_wake_up(&io->s.op.c->moving_gc_wait);

	closure_return_with_destructor(cl, moving_io_destructor);
}
63 | |||
/*
 * Endio for the read half of a move: record any error in the search so
 * write_moving() can skip the reinsert, then hand off to the common bbio
 * completion path.
 */
static void read_moving_endio(struct bio *bio, int error)
{
	/* bi_private points at s.cl (set by moving_init()) */
	struct moving_io *io = container_of(bio->bi_private,
					    struct moving_io, s.cl);

	if (error)
		io->s.error = error;

	bch_bbio_endio(io->s.op.c, bio, error, "reading data to move");
}
74 | |||
/*
 * (Re)initialize io's bio to cover exactly the data described by io->w->key,
 * using the inline bio_vecs allocated after the struct. Called once before
 * the read and again before the write, since the bio is reused.
 */
static void moving_init(struct moving_io *io)
{
	struct bio *bio = &io->bio.bio;

	bio_init(bio);
	bio_get(bio);
	/* Background work: lowest IO priority */
	bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	/* Size the bio from the key; KEY_SIZE() is in 512-byte sectors */
	bio->bi_size		= KEY_SIZE(&io->w->key) << 9;
	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&io->w->key),
					       PAGE_SECTORS);
	bio->bi_private		= &io->s.cl;
	bio->bi_io_vec		= bio->bi_inline_vecs;
	bch_bio_map(bio, NULL);
}
90 | |||
/*
 * Write half of a move: if the read succeeded, reinsert the data via
 * bch_insert_data() as a BTREE_REPLACE (only succeeds if the original
 * key is still present, i.e. nobody rewrote it while we were reading).
 * Continues to write_moving_finish() when the insert closure finishes.
 */
static void write_moving(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct moving_io *io = container_of(s, struct moving_io, s);

	if (!s->error) {
		trace_bcache_write_moving(&io->bio.bio);

		/* Reset the bio -- it was just consumed by the read */
		moving_init(io);

		io->bio.bio.bi_sector	= KEY_START(&io->w->key);
		s->op.lock		= -1;
		s->op.write_prio	= 1;
		s->op.cache_bio		= &io->bio.bio;

		/* Preserve dirtiness and checksum setting of the original */
		s->writeback		= KEY_DIRTY(&io->w->key);
		s->op.csum		= KEY_CSUM(&io->w->key);

		s->op.type = BTREE_REPLACE;
		bkey_copy(&s->op.replace, &io->w->key);

		closure_init(&s->op.cl, cl);
		bch_insert_data(&s->op.cl);
	}

	continue_at(cl, write_moving_finish, NULL);
}
118 | |||
/*
 * Submit the read of the data being moved; when the bio completes the
 * closure continues at write_moving() on the GC workqueue.
 */
static void read_moving_submit(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct moving_io *io = container_of(s, struct moving_io, s);
	struct bio *bio = &io->bio.bio;

	trace_bcache_read_moving(bio);
	bch_submit_bbio(bio, s->op.c, &io->w->key, 0);

	continue_at(cl, write_moving, bch_gc_wq);
}
130 | |||
/*
 * Main moving-GC IO loop: scan the moving_gc_keys keybuf, and for each key
 * allocate a moving_io (with trailing bio_vecs sized for the key), fill its
 * pages, and kick off the read/write pipeline. At most 64 moves are kept in
 * flight; past that we block on moving_gc_wait and reschedule ourselves.
 */
static void read_moving(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
	struct keybuf_key *w;
	struct moving_io *io;
	struct bio *bio;

	/* XXX: if we error, background writeback could stall indefinitely */

	while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
		w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
		if (!w)
			break;

		io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->w		= w;
		io->s.op.inode	= KEY_INODE(&w->key);
		io->s.op.c	= c;

		moving_init(io);
		bio = &io->bio.bio;

		bio->bi_rw	= READ;
		bio->bi_end_io	= read_moving_endio;

		if (bch_bio_alloc_pages(bio, GFP_KERNEL))
			goto err;

		pr_debug("%s", pkey(&w->key));

		closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);

		/* Throttle: cap in-flight moves at 64 */
		if (atomic_inc_return(&c->in_flight) >= 64) {
			closure_wait_event(&c->moving_gc_wait, cl,
					   atomic_read(&c->in_flight) < 64);
			continue_at(cl, read_moving, bch_gc_wq);
		}
	}

	/*
	 * Error path, reachable only via goto (the if (0) keeps it out of
	 * normal flow). NOTE(review): on the first goto (io alloc failure)
	 * w->private holds whatever was previously stored in the keybuf
	 * entry -- presumably NULL, but worth confirming against keybuf
	 * initialization.
	 */
	if (0) {
err:		if (!IS_ERR_OR_NULL(w->private))
			kfree(w->private);

		bch_keybuf_del(&c->moving_gc_keys, w);
	}

	closure_return(cl);
}
185 | |||
186 | static bool bucket_cmp(struct bucket *l, struct bucket *r) | ||
187 | { | ||
188 | return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); | ||
189 | } | ||
190 | |||
191 | static unsigned bucket_heap_top(struct cache *ca) | ||
192 | { | ||
193 | return GC_SECTORS_USED(heap_peek(&ca->heap)); | ||
194 | } | ||
195 | |||
196 | void bch_moving_gc(struct closure *cl) | ||
197 | { | ||
198 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); | ||
199 | struct cache *ca; | ||
200 | struct bucket *b; | ||
201 | unsigned i; | ||
202 | |||
203 | if (!c->copy_gc_enabled) | ||
204 | closure_return(cl); | ||
205 | |||
206 | mutex_lock(&c->bucket_lock); | ||
207 | |||
208 | for_each_cache(ca, c, i) { | ||
209 | unsigned sectors_to_move = 0; | ||
210 | unsigned reserve_sectors = ca->sb.bucket_size * | ||
211 | min(fifo_used(&ca->free), ca->free.size / 2); | ||
212 | |||
213 | ca->heap.used = 0; | ||
214 | |||
215 | for_each_bucket(b, ca) { | ||
216 | if (!GC_SECTORS_USED(b)) | ||
217 | continue; | ||
218 | |||
219 | if (!heap_full(&ca->heap)) { | ||
220 | sectors_to_move += GC_SECTORS_USED(b); | ||
221 | heap_add(&ca->heap, b, bucket_cmp); | ||
222 | } else if (bucket_cmp(b, heap_peek(&ca->heap))) { | ||
223 | sectors_to_move -= bucket_heap_top(ca); | ||
224 | sectors_to_move += GC_SECTORS_USED(b); | ||
225 | |||
226 | ca->heap.data[0] = b; | ||
227 | heap_sift(&ca->heap, 0, bucket_cmp); | ||
228 | } | ||
229 | } | ||
230 | |||
231 | while (sectors_to_move > reserve_sectors) { | ||
232 | heap_pop(&ca->heap, b, bucket_cmp); | ||
233 | sectors_to_move -= GC_SECTORS_USED(b); | ||
234 | } | ||
235 | |||
236 | ca->gc_move_threshold = bucket_heap_top(ca); | ||
237 | |||
238 | pr_debug("threshold %u", ca->gc_move_threshold); | ||
239 | } | ||
240 | |||
241 | mutex_unlock(&c->bucket_lock); | ||
242 | |||
243 | c->moving_gc_keys.last_scanned = ZERO_KEY; | ||
244 | |||
245 | closure_init(&c->moving_gc, cl); | ||
246 | read_moving(&c->moving_gc); | ||
247 | |||
248 | closure_return(cl); | ||
249 | } | ||
250 | |||
/* One-time setup: register moving_pred() as the moving-GC keybuf predicate. */
void bch_moving_init_cache_set(struct cache_set *c)
{
	bch_keybuf_init(&c->moving_gc_keys, moving_pred);
}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c new file mode 100644 index 000000000000..e5ff12e52d5b --- /dev/null +++ b/drivers/md/bcache/request.c | |||
@@ -0,0 +1,1411 @@ | |||
1 | /* | ||
2 | * Main bcache entry point - handle a read or a write request and decide what to | ||
3 | * do with it; the make_request functions are called by the block layer. | ||
4 | * | ||
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
6 | * Copyright 2012 Google, Inc. | ||
7 | */ | ||
8 | |||
9 | #include "bcache.h" | ||
10 | #include "btree.h" | ||
11 | #include "debug.h" | ||
12 | #include "request.h" | ||
13 | |||
14 | #include <linux/cgroup.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/hash.h> | ||
17 | #include <linux/random.h> | ||
18 | #include "blk-cgroup.h" | ||
19 | |||
20 | #include <trace/events/bcache.h> | ||
21 | |||
22 | #define CUTOFF_CACHE_ADD 95 | ||
23 | #define CUTOFF_CACHE_READA 90 | ||
24 | #define CUTOFF_WRITEBACK 50 | ||
25 | #define CUTOFF_WRITEBACK_SYNC 75 | ||
26 | |||
27 | struct kmem_cache *bch_search_cache; | ||
28 | |||
29 | static void check_should_skip(struct cached_dev *, struct search *); | ||
30 | |||
31 | /* Cgroup interface */ | ||
32 | |||
33 | #ifdef CONFIG_CGROUP_BCACHE | ||
34 | static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; | ||
35 | |||
36 | static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) | ||
37 | { | ||
38 | struct cgroup_subsys_state *css; | ||
39 | return cgroup && | ||
40 | (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) | ||
41 | ? container_of(css, struct bch_cgroup, css) | ||
42 | : &bcache_default_cgroup; | ||
43 | } | ||
44 | |||
45 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) | ||
46 | { | ||
47 | struct cgroup_subsys_state *css = bio->bi_css | ||
48 | ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) | ||
49 | : task_subsys_state(current, bcache_subsys_id); | ||
50 | |||
51 | return css | ||
52 | ? container_of(css, struct bch_cgroup, css) | ||
53 | : &bcache_default_cgroup; | ||
54 | } | ||
55 | |||
56 | static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, | ||
57 | struct file *file, | ||
58 | char __user *buf, size_t nbytes, loff_t *ppos) | ||
59 | { | ||
60 | char tmp[1024]; | ||
61 | int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, | ||
62 | cgroup_to_bcache(cgrp)->cache_mode + 1); | ||
63 | |||
64 | if (len < 0) | ||
65 | return len; | ||
66 | |||
67 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | ||
68 | } | ||
69 | |||
70 | static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, | ||
71 | const char *buf) | ||
72 | { | ||
73 | int v = bch_read_string_list(buf, bch_cache_modes); | ||
74 | if (v < 0) | ||
75 | return v; | ||
76 | |||
77 | cgroup_to_bcache(cgrp)->cache_mode = v - 1; | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) | ||
82 | { | ||
83 | return cgroup_to_bcache(cgrp)->verify; | ||
84 | } | ||
85 | |||
86 | static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) | ||
87 | { | ||
88 | cgroup_to_bcache(cgrp)->verify = val; | ||
89 | return 0; | ||
90 | } | ||
91 | |||
92 | static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) | ||
93 | { | ||
94 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
95 | return atomic_read(&bcachecg->stats.cache_hits); | ||
96 | } | ||
97 | |||
98 | static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) | ||
99 | { | ||
100 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
101 | return atomic_read(&bcachecg->stats.cache_misses); | ||
102 | } | ||
103 | |||
104 | static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, | ||
105 | struct cftype *cft) | ||
106 | { | ||
107 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
108 | return atomic_read(&bcachecg->stats.cache_bypass_hits); | ||
109 | } | ||
110 | |||
111 | static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, | ||
112 | struct cftype *cft) | ||
113 | { | ||
114 | struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); | ||
115 | return atomic_read(&bcachecg->stats.cache_bypass_misses); | ||
116 | } | ||
117 | |||
/* cgroup control files exported by the bcache subsystem. */
static struct cftype bch_files[] = {
	{
		.name		= "cache_mode",
		.read		= cache_mode_read,
		.write_string	= cache_mode_write,
	},
	{
		.name		= "verify",
		.read_u64	= bch_verify_read,
		.write_u64	= bch_verify_write,
	},
	{
		.name		= "cache_hits",
		.read_u64	= bch_cache_hits_read,
	},
	{
		.name		= "cache_misses",
		.read_u64	= bch_cache_misses_read,
	},
	{
		.name		= "cache_bypass_hits",
		.read_u64	= bch_cache_bypass_hits_read,
	},
	{
		.name		= "cache_bypass_misses",
		.read_u64	= bch_cache_bypass_misses_read,
	},
	{ }	/* terminate */
};
147 | |||
/* -1 cache_mode means "inherit": fall through to the device's setting. */
static void init_bch_cgroup(struct bch_cgroup *cg)
{
	cg->cache_mode = -1;
}
152 | |||
153 | static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) | ||
154 | { | ||
155 | struct bch_cgroup *cg; | ||
156 | |||
157 | cg = kzalloc(sizeof(*cg), GFP_KERNEL); | ||
158 | if (!cg) | ||
159 | return ERR_PTR(-ENOMEM); | ||
160 | init_bch_cgroup(cg); | ||
161 | return &cg->css; | ||
162 | } | ||
163 | |||
/* cgroup subsystem destroy hook: release the css id and free the state. */
static void bcachecg_destroy(struct cgroup *cgroup)
{
	struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
	free_css_id(&bcache_subsys, &cg->css);
	kfree(cg);
}
170 | |||
/* The bcache cgroup subsystem; files are registered via bch_files. */
struct cgroup_subsys bcache_subsys = {
	.create		= bcachecg_create,
	.destroy	= bcachecg_destroy,
	.subsys_id	= bcache_subsys_id,
	.name		= "bcache",
	.module		= THIS_MODULE,
};
EXPORT_SYMBOL_GPL(bcache_subsys);
179 | #endif | ||
180 | |||
/*
 * Effective cache mode for a bio: the cgroup's mode if one is set
 * (>= 0; -1 means inherit), otherwise the backing device's mode.
 */
static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
{
#ifdef CONFIG_CGROUP_BCACHE
	int r = bch_bio_to_cgroup(bio)->cache_mode;
	if (r >= 0)
		return r;
#endif
	return BDEV_CACHE_MODE(&dc->sb);
}
190 | |||
/*
 * Whether read verification is enabled for this bio: on if either the
 * bio's cgroup or the backing device requests it.
 */
static bool verify(struct cached_dev *dc, struct bio *bio)
{
#ifdef CONFIG_CGROUP_BCACHE
	if (bch_bio_to_cgroup(bio)->verify)
		return true;
#endif
	return dc->verify;
}
199 | |||
200 | static void bio_csum(struct bio *bio, struct bkey *k) | ||
201 | { | ||
202 | struct bio_vec *bv; | ||
203 | uint64_t csum = 0; | ||
204 | int i; | ||
205 | |||
206 | bio_for_each_segment(bv, bio, i) { | ||
207 | void *d = kmap(bv->bv_page) + bv->bv_offset; | ||
208 | csum = bch_crc64_update(csum, d, bv->bv_len); | ||
209 | kunmap(bv->bv_page); | ||
210 | } | ||
211 | |||
212 | k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); | ||
213 | } | ||
214 | |||
215 | /* Insert data into cache */ | ||
216 | |||
/*
 * Invalidate the region of the cache covered by op->cache_bio by emitting
 * zero-pointer keys (one per <= 2^14 sector chunk). If the keylist fills
 * up, we bail to bch_journal() with insert_data_done unset so we're called
 * again for the remainder.
 */
static void bio_invalidate(struct closure *cl)
{
	struct btree_op *op = container_of(cl, struct btree_op, cl);
	struct bio *bio = op->cache_bio;

	pr_debug("invalidating %i sectors from %llu",
		 bio_sectors(bio), (uint64_t) bio->bi_sector);

	while (bio_sectors(bio)) {
		unsigned len = min(bio_sectors(bio), 1U << 14);

		/* Out of keylist space: journal what we have, retry later */
		if (bch_keylist_realloc(&op->keys, 0, op->c))
			goto out;

		bio->bi_sector	+= len;
		bio->bi_size	-= len << 9;

		/* KEY() with no pointers added == invalidate this range */
		bch_keylist_add(&op->keys,
				&KEY(op->inode, bio->bi_sector, len));
	}

	op->insert_data_done = true;
	bio_put(bio);
out:
	continue_at(cl, bch_journal, bcache_wq);
}
243 | |||
/*
 * An open bucket: a bucket currently being filled with data. Tracks the
 * task that last wrote to it and how much room is left, so sequential and
 * per-task writes can be kept together (see pick_data_bucket()).
 */
struct open_bucket {
	struct list_head	list;
	struct task_struct	*last;
	unsigned		sectors_free;
	BKEY_PADDED(key);
};
250 | |||
251 | void bch_open_buckets_free(struct cache_set *c) | ||
252 | { | ||
253 | struct open_bucket *b; | ||
254 | |||
255 | while (!list_empty(&c->data_buckets)) { | ||
256 | b = list_first_entry(&c->data_buckets, | ||
257 | struct open_bucket, list); | ||
258 | list_del(&b->list); | ||
259 | kfree(b); | ||
260 | } | ||
261 | } | ||
262 | |||
263 | int bch_open_buckets_alloc(struct cache_set *c) | ||
264 | { | ||
265 | int i; | ||
266 | |||
267 | spin_lock_init(&c->data_bucket_lock); | ||
268 | |||
269 | for (i = 0; i < 6; i++) { | ||
270 | struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); | ||
271 | if (!b) | ||
272 | return -ENOMEM; | ||
273 | |||
274 | list_add(&b->list, &c->data_buckets); | ||
275 | } | ||
276 | |||
277 | return 0; | ||
278 | } | ||
279 | |||
280 | /* | ||
281 | * We keep multiple buckets open for writes, and try to segregate different | ||
282 | * write streams for better cache utilization: first we look for a bucket where | ||
283 | * the last write to it was sequential with the current write, and failing that | ||
284 | * we look for a bucket that was last used by the same task. | ||
285 | * | ||
286 | * The ideas is if you've got multiple tasks pulling data into the cache at the | ||
287 | * same time, you'll get better cache utilization if you try to segregate their | ||
288 | * data and preserve locality. | ||
289 | * | ||
290 | * For example, say you've starting Firefox at the same time you're copying a | ||
291 | * bunch of files. Firefox will likely end up being fairly hot and stay in the | ||
292 | * cache awhile, but the data you copied might not be; if you wrote all that | ||
293 | * data to the same buckets it'd get invalidated at the same time. | ||
294 | * | ||
295 | * Both of those tasks will be doing fairly random IO so we can't rely on | ||
296 | * detecting sequential IO to segregate their data, but going off of the task | ||
297 | * should be a sane heuristic. | ||
298 | */ | ||
/*
 * Pick an open bucket for this write: prefer the bucket whose key exactly
 * matches @search (i.e. the write is sequential with the last one), then
 * the bucket last used by @task, then the least-recently-used bucket.
 *
 * If the chosen bucket is exhausted and @alloc holds a freshly allocated
 * bucket (KEY_PTRS != 0), the bucket is refilled from @alloc (consuming
 * it). Returns NULL if the chosen bucket has no space and no allocation
 * was provided -- caller must allocate and retry.
 *
 * Caller holds c->data_bucket_lock.
 */
static struct open_bucket *pick_data_bucket(struct cache_set *c,
					    const struct bkey *search,
					    struct task_struct *task,
					    struct bkey *alloc)
{
	struct open_bucket *ret, *ret_task = NULL;

	/* Newest (tail) first, since list is kept in LRU order */
	list_for_each_entry_reverse(ret, &c->data_buckets, list)
		if (!bkey_cmp(&ret->key, search))
			goto found;
		else if (ret->last == task)
			ret_task = ret;

	ret = ret_task ?: list_first_entry(&c->data_buckets,
					   struct open_bucket, list);
found:
	if (!ret->sectors_free && KEY_PTRS(alloc)) {
		ret->sectors_free = c->sb.bucket_size;
		bkey_copy(&ret->key, alloc);
		bkey_init(alloc);
	}

	if (!ret->sectors_free)
		ret = NULL;

	return ret;
}
326 | |||
327 | /* | ||
328 | * Allocates some space in the cache to write to, and k to point to the newly | ||
329 | * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the | ||
330 | * end of the newly allocated space). | ||
331 | * | ||
332 | * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many | ||
333 | * sectors were actually allocated. | ||
334 | * | ||
335 | * If s->writeback is true, will not fail. | ||
336 | */ | ||
static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
			      struct search *s)
{
	struct cache_set *c = s->op.c;
	struct open_bucket *b;
	BKEY_PADDED(key) alloc;
	struct closure cl, *w = NULL;
	unsigned i;

	/*
	 * For writeback writes we must not fail: pass a closure so
	 * bch_bucket_alloc_set() blocks until a bucket is available.
	 */
	if (s->writeback) {
		closure_init_stack(&cl);
		w = &cl;
	}

	/*
	 * We might have to allocate a new bucket, which we can't do with a
	 * spinlock held. So if we have to allocate, we drop the lock, allocate
	 * and then retry. KEY_PTRS() indicates whether alloc points to
	 * allocated bucket(s).
	 */

	bkey_init(&alloc.key);
	spin_lock(&c->data_bucket_lock);

	while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
		unsigned watermark = s->op.write_prio
			? WATERMARK_MOVINGGC
			: WATERMARK_NONE;

		/* Can't allocate under the spinlock; drop, alloc, retry */
		spin_unlock(&c->data_bucket_lock);

		if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
			return false;

		spin_lock(&c->data_bucket_lock);
	}

	/*
	 * If we had to allocate, we might race and not need to allocate the
	 * second time we call find_data_bucket(). If we allocated a bucket but
	 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
	 */
	if (KEY_PTRS(&alloc.key))
		__bkey_put(c, &alloc.key);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		EBUG_ON(ptr_stale(c, &b->key, i));

	/* Set up the pointer to the space we're allocating: */

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		k->ptr[i] = b->key.ptr[i];

	/* May allocate less than requested; caller reads KEY_SIZE(k) */
	sectors = min(sectors, b->sectors_free);

	SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
	SET_KEY_SIZE(k, sectors);
	SET_KEY_PTRS(k, KEY_PTRS(&b->key));

	/*
	 * Move b to the end of the lru, and keep track of what this bucket was
	 * last used for:
	 */
	list_move_tail(&b->list, &c->data_buckets);
	bkey_copy_key(&b->key, k);
	b->last = s->task;

	b->sectors_free	-= sectors;

	for (i = 0; i < KEY_PTRS(&b->key); i++) {
		/* Advance the open bucket's pointers past the space used */
		SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);

		atomic_long_add(sectors,
				&PTR_CACHE(c, &b->key, i)->sectors_written);
	}

	/* Less than a block left: consider the bucket full */
	if (b->sectors_free < c->sb.block_size)
		b->sectors_free = 0;

	/*
	 * k takes refcounts on the buckets it points to until it's inserted
	 * into the btree, but if we're done with this bucket we just transfer
	 * get_data_bucket()'s refcount.
	 */
	if (b->sectors_free)
		for (i = 0; i < KEY_PTRS(&b->key); i++)
			atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);

	spin_unlock(&c->data_bucket_lock);
	return true;
}
428 | |||
static void bch_insert_data_error(struct closure *cl)
{
	struct btree_op *op = container_of(cl, struct btree_op, cl);

	/*
	 * Our data write just errored, which means we've got a bunch of keys to
	 * insert that point to data that wasn't succesfully written.
	 *
	 * We don't have to insert those keys but we still have to invalidate
	 * that region of the cache - so, if we just strip off all the pointers
	 * from the keys we'll accomplish just that.
	 */

	struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;

	/*
	 * In-place compaction: zero-pointer keys are smaller than the
	 * originals, so dst never overtakes src.
	 */
	while (src != op->keys.top) {
		struct bkey *n = bkey_next(src);

		SET_KEY_PTRS(src, 0);
		bkey_copy(dst, src);

		dst = bkey_next(dst);
		src = n;
	}

	op->keys.top = dst;

	bch_journal(cl);
}
458 | |||
/*
 * Endio for cache data writes. On error: writeback writes must report the
 * failure (the data exists nowhere else); writethrough writes reroute the
 * closure to bch_insert_data_error() to strip the bad pointers; cache-miss
 * fills just drop the continuation.
 */
static void bch_insert_data_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct btree_op *op = container_of(cl, struct btree_op, cl);
	struct search *s = container_of(op, struct search, op);

	if (error) {
		/* TODO: We could try to recover from this. */
		if (s->writeback)
			s->error = error;
		else if (s->write)
			set_closure_fn(cl, bch_insert_data_error, bcache_wq);
		else
			set_closure_fn(cl, NULL, NULL);
	}

	bch_bbio_endio(op->c, bio, error, "writing data to cache");
}
477 | |||
/*
 * Core of bch_insert_data(): repeatedly allocate cache space, split off as
 * much of the bio as fits, create a key for it, and submit the write.
 * Reschedules itself (via continue_at) whenever it must wait for keylist
 * space or a bio split allocation; continues at bch_journal() when the
 * whole bio has been dispatched or abandoned.
 */
static void bch_insert_data_loop(struct closure *cl)
{
	struct btree_op *op = container_of(cl, struct btree_op, cl);
	struct search *s = container_of(op, struct search, op);
	struct bio *bio = op->cache_bio, *n;

	if (op->skip)
		return bio_invalidate(cl);

	/* Trigger gc once enough sectors have been written */
	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
		set_gc_sectors(op->c);
		bch_queue_gc(op->c);
	}

	do {
		unsigned i;
		struct bkey *k;
		struct bio_set *split = s->d
			? s->d->bio_split : op->c->bio_split;

		/* 1 for the device pointer and 1 for the chksum */
		if (bch_keylist_realloc(&op->keys,
					1 + (op->csum ? 1 : 0),
					op->c))
			continue_at(cl, bch_journal, bcache_wq);

		k = op->keys.top;
		bkey_init(k);
		SET_KEY_INODE(k, op->inode);
		SET_KEY_OFFSET(k, bio->bi_sector);

		/* Sets KEY_SIZE(k) to however much was actually allocated */
		if (!bch_alloc_sectors(k, bio_sectors(bio), s))
			goto err;

		n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
		if (!n) {
			/* Split failed: drop k's bucket refs and retry */
			__bkey_put(op->c, k);
			continue_at(cl, bch_insert_data_loop, bcache_wq);
		}

		n->bi_end_io	= bch_insert_data_endio;
		n->bi_private	= cl;

		if (s->writeback) {
			SET_KEY_DIRTY(k, true);

			for (i = 0; i < KEY_PTRS(k); i++)
				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
					    GC_MARK_DIRTY);
		}

		SET_KEY_CSUM(k, op->csum);
		if (KEY_CSUM(k))
			bio_csum(n, k);

		pr_debug("%s", pkey(k));
		bch_keylist_push(&op->keys);

		trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
		n->bi_rw |= REQ_WRITE;
		bch_submit_bbio(n, op->c, k, 0);
	} while (n != bio);	/* bch_bio_split returns bio itself when done */

	op->insert_data_done = true;
	continue_at(cl, bch_journal, bcache_wq);
err:
	/* bch_alloc_sectors() blocks if s->writeback = true */
	BUG_ON(s->writeback);

	/*
	 * But if it's not a writeback write we'd rather just bail out if
	 * there aren't any buckets ready to write to - it might take awhile and
	 * we might be starving btree writes for gc or something.
	 */

	if (s->write) {
		/*
		 * Writethrough write: We can't complete the write until we've
		 * updated the index. But we don't want to delay the write while
		 * we wait for buckets to be freed up, so just invalidate the
		 * rest of the write.
		 */
		op->skip = true;
		return bio_invalidate(cl);
	} else {
		/*
		 * From a cache miss, we can just insert the keys for the data
		 * we have written or bail out if we didn't do anything.
		 */
		op->insert_data_done = true;
		bio_put(bio);

		if (!bch_keylist_empty(&op->keys))
			continue_at(cl, bch_journal, bcache_wq);
		else
			closure_return(cl);
	}
}
576 | |||
577 | /** | ||
578 | * bch_insert_data - stick some data in the cache | ||
579 | * | ||
580 | * This is the starting point for any data to end up in a cache device; it could | ||
581 | * be from a normal write, or a writeback write, or a write to a flash only | ||
582 | * volume - it's also used by the moving garbage collector to compact data in | ||
583 | * mostly empty buckets. | ||
584 | * | ||
585 | * It first writes the data to the cache, creating a list of keys to be inserted | ||
586 | * (if the data had to be fragmented there will be multiple keys); after the | ||
587 | * data is written it calls bch_journal, and after the keys have been added to | ||
588 | * the next journal write they're inserted into the btree. | ||
589 | * | ||
590 | * It inserts the data in op->cache_bio; bi_sector is used for the key offset, | ||
591 | * and op->inode is used for the key inode. | ||
592 | * | ||
593 | * If op->skip is true, instead of inserting the data it invalidates the region | ||
594 | * of the cache represented by op->cache_bio and op->inode. | ||
595 | */ | ||
void bch_insert_data(struct closure *cl)
{
	struct btree_op *op = container_of(cl, struct btree_op, cl);

	bch_keylist_init(&op->keys);
	/* Extra ref: the loop splits from cache_bio and puts it when done */
	bio_get(op->cache_bio);
	bch_insert_data_loop(cl);
}
604 | |||
/*
 * Insert the accumulated keys into the btree (called after they've been
 * journalled). If more data remains to write, loops back to
 * bch_insert_data_loop(); otherwise frees the keylist and finishes.
 */
void bch_btree_insert_async(struct closure *cl)
{
	struct btree_op *op = container_of(cl, struct btree_op, cl);
	struct search *s = container_of(op, struct search, op);

	if (bch_btree_insert(op, op->c)) {
		/* Insert failed: report and stop feeding more data */
		s->error		= -ENOMEM;
		op->insert_data_done	= true;
	}

	if (op->insert_data_done) {
		bch_keylist_free(&op->keys);
		closure_return(cl);
	} else
		continue_at(cl, bch_insert_data_loop, bcache_wq);
}
621 | |||
622 | /* Common code for the make_request functions */ | ||
623 | |||
624 | static void request_endio(struct bio *bio, int error) | ||
625 | { | ||
626 | struct closure *cl = bio->bi_private; | ||
627 | |||
628 | if (error) { | ||
629 | struct search *s = container_of(cl, struct search, cl); | ||
630 | s->error = error; | ||
631 | /* Only cache read errors are recoverable */ | ||
632 | s->recoverable = false; | ||
633 | } | ||
634 | |||
635 | bio_put(bio); | ||
636 | closure_put(cl); | ||
637 | } | ||
638 | |||
void bch_cache_read_endio(struct bio *bio, int error)
{
	struct bbio *b = container_of(bio, struct bbio, bio);
	struct closure *cl = bio->bi_private;
	struct search *s = container_of(cl, struct search, cl);

	/*
	 * If the bucket was reused while our bio was in flight, we might have
	 * read the wrong data. Set s->error but not error so it doesn't get
	 * counted against the cache device, but we'll still reread the data
	 * from the backing device.
	 */

	if (error)
		s->error = error;
	else if (ptr_stale(s->op.c, &b->key, 0)) {
		/* Bucket reused mid-read: count the race, force a reread */
		atomic_long_inc(&s->op.c->cache_read_races);
		s->error = -EINTR;
	}

	bch_bbio_endio(s->op.c, bio, error, "reading from cache");
}
661 | |||
/*
 * Complete the original (caller's) bio, accounting the request's duration
 * in the block layer stats. Idempotent: orig_bio is cleared so a second
 * call is a no-op.
 */
static void bio_complete(struct search *s)
{
	if (s->orig_bio) {
		int cpu, rw = bio_data_dir(s->orig_bio);
		unsigned long duration = jiffies - s->start_time;

		cpu = part_stat_lock();
		part_round_stats(cpu, &s->d->disk->part0);
		part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
		part_stat_unlock();

		trace_bcache_request_end(s, s->orig_bio);
		bio_endio(s->orig_bio, s->error);
		s->orig_bio = NULL;
	}
}
678 | |||
/*
 * Initialize s->bio as a clone of the original bio, redirected at the
 * search's closure for completion.
 */
static void do_bio_hook(struct search *s)
{
	struct bio *bio = &s->bio.bio;
	memcpy(bio, s->orig_bio, sizeof(struct bio));

	bio->bi_end_io	= request_endio;
	bio->bi_private	= &s->cl;
	/*
	 * NOTE(review): refcount of 3 -- presumably one per possible
	 * submission/split of this bio during the request; confirm against
	 * the read/write paths before changing.
	 */
	atomic_set(&bio->bi_cnt, 3);
}
688 | |||
/*
 * Final cleanup for a search: finish the original bio, drop the cache
 * insert bio if any, return the copied bvec array to its mempool, and
 * free the search itself.  Runs as the closure's final continuation.
 */
static void search_free(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	bio_complete(s);

	if (s->op.cache_bio)
		bio_put(s->op.cache_bio);

	if (s->unaligned_bvec)
		mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);

	closure_debug_destroy(cl);
	mempool_free(s, s->d->c->search);
}
703 | |||
/*
 * Allocate and initialize a struct search for @bio on device @d.
 *
 * Only the fields up to op.keys are zeroed; everything past that is
 * per-operation state the btree code owns.  If the bio's segments aren't
 * all full pages, a private copy of its bvec array is taken from
 * d->unaligned_bvec so the read-retry path can restore it later.
 */
static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
{
	struct bio_vec *bv;
	struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
	memset(s, 0, offsetof(struct search, op.keys));

	__closure_init(&s->cl, NULL);

	s->op.inode = d->id;
	s->op.c = d->c;
	s->d = d;
	s->op.lock = -1;
	s->task = current;
	s->orig_bio = bio;
	s->write = (bio->bi_rw & REQ_WRITE) != 0;
	s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0;
	s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
	s->recoverable = 1;
	s->start_time = jiffies;
	do_bio_hook(s);

	/* segments aren't all page-sized/aligned: keep a private bvec copy */
	if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
		bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
		memcpy(bv, bio_iovec(bio),
		       sizeof(struct bio_vec) * bio_segments(bio));

		s->bio.bio.bi_io_vec = bv;
		s->unaligned_bvec = 1;
	}

	return s;
}
736 | |||
/*
 * Closure continuation that performs the btree lookup for a request.
 * btree_root() returns -EAGAIN when it had to drop locks or block; in
 * that case reschedule ourselves on bcache_wq and retry from the root.
 */
static void btree_read_async(struct closure *cl)
{
	struct btree_op *op = container_of(cl, struct btree_op, cl);

	int ret = btree_root(search_recurse, op->c, op);

	if (ret == -EAGAIN)
		continue_at(cl, btree_read_async, bcache_wq);

	closure_return(cl);
}
748 | |||
749 | /* Cached devices */ | ||
750 | |||
/*
 * Common completion for cached-device requests: free the search, then
 * drop the ref on the cached_dev taken in cached_dev_make_request().
 */
static void cached_dev_bio_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	/* look up dc before search_free() invalidates s */
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	search_free(cl);
	cached_dev_put(dc);
}
759 | |||
760 | /* Process reads */ | ||
761 | |||
/*
 * Read completion: account a cache-miss insert collision if one happened,
 * free the bounce pages allocated for the cache insert bio, and finish
 * via cached_dev_bio_complete().
 */
static void cached_dev_read_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);

	if (s->op.insert_collision)
		bch_mark_cache_miss_collision(s);

	if (s->op.cache_bio) {
		int i;
		struct bio_vec *bv;

		/* free bounce pages allocated in cached_dev_cache_miss() */
		__bio_for_each_segment(bv, s->op.cache_bio, i, 0)
			__free_page(bv->bv_page);
	}

	cached_dev_bio_complete(cl);
}
779 | |||
/*
 * A read via the cache failed.  If the failure is recoverable (it was not
 * the backing device that errored), clear the error and reissue the whole
 * request to the backing device; otherwise just complete with the error.
 */
static void request_read_error(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct bio_vec *bv;
	int i;

	if (s->recoverable) {
		/* The cache read failed, but we can retry from the backing
		 * device.
		 */
		pr_debug("recovering at sector %llu",
			 (uint64_t) s->orig_bio->bi_sector);

		s->error = 0;
		/* do_bio_hook() clobbers s->bio wholesale; preserve our bvec array */
		bv = s->bio.bio.bi_io_vec;
		do_bio_hook(s);
		s->bio.bio.bi_io_vec = bv;

		if (!s->unaligned_bvec)
			/* reset each bvec to a full page (partial IO may have advanced them) */
			bio_for_each_segment(bv, s->orig_bio, i)
				bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
		else
			/* restore the saved copy of the original (unaligned) bvecs */
			memcpy(s->bio.bio.bi_io_vec,
			       bio_iovec(s->orig_bio),
			       sizeof(struct bio_vec) *
			       bio_segments(s->orig_bio));

		/* XXX: invalidate cache */

		trace_bcache_read_retry(&s->bio.bio);
		closure_bio_submit(&s->bio.bio, &s->cl, s->d);
	}

	continue_at(cl, cached_dev_read_complete, NULL);
}
815 | |||
/*
 * Read finished successfully (possibly partly from the backing device).
 * If there was a cache miss, copy the freshly-read data out of
 * cache_bio's bounce buffers into the original bio's pages, then kick
 * off the asynchronous insert of that data into the cache.
 */
static void request_read_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	/*
	 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now
	 * contains data ready to be inserted into the cache.
	 *
	 * First, we copy the data we just read from cache_bio's bounce buffers
	 * to the buffers the original bio pointed to:
	 */

	if (s->op.cache_bio) {
		struct bio_vec *src, *dst;
		unsigned src_offset, dst_offset, bytes;
		void *dst_ptr;

		/* rebuild cache_bio's bvecs to cover the full miss + readahead */
		bio_reset(s->op.cache_bio);
		s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
		s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
		s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
		bch_bio_map(s->op.cache_bio, NULL);

		src = bio_iovec(s->op.cache_bio);
		dst = bio_iovec(s->cache_miss);
		src_offset = src->bv_offset;
		dst_offset = dst->bv_offset;
		dst_ptr = kmap(dst->bv_page);

		/*
		 * Walk both bvec lists in lockstep, copying min-sized runs.
		 * dst (the caller's pages) may be highmem, hence kmap; src
		 * pages were allocated by us and are directly addressable.
		 */
		while (1) {
			if (dst_offset == dst->bv_offset + dst->bv_len) {
				kunmap(dst->bv_page);
				dst++;
				if (dst == bio_iovec_idx(s->cache_miss,
						s->cache_miss->bi_vcnt))
					break;

				dst_offset = dst->bv_offset;
				dst_ptr = kmap(dst->bv_page);
			}

			if (src_offset == src->bv_offset + src->bv_len) {
				src++;
				/* src covers at least as much data as dst, so it can't run out first */
				if (src == bio_iovec_idx(s->op.cache_bio,
						 s->op.cache_bio->bi_vcnt))
					BUG();

				src_offset = src->bv_offset;
			}

			bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
				    src->bv_offset + src->bv_len - src_offset);

			memcpy(dst_ptr + dst_offset,
			       page_address(src->bv_page) + src_offset,
			       bytes);

			src_offset += bytes;
			dst_offset += bytes;
		}

		bio_put(s->cache_miss);
		s->cache_miss = NULL;
	}

	if (verify(dc, &s->bio.bio) && s->recoverable)
		bch_data_verify(s);

	bio_complete(s);

	/* insert the missed data into the cache, unless the set is shutting down */
	if (s->op.cache_bio &&
	    !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
		s->op.type = BTREE_REPLACE;
		closure_call(&s->op.cl, bch_insert_data, NULL, cl);
	}

	continue_at(cl, cached_dev_read_complete, NULL);
}
895 | |||
/*
 * Dispatch after the btree lookup and its IO finished: update hit/miss
 * accounting, then route to the error path, the slow (copy/verify) path,
 * or the fast completion.  The blocking paths are punted to bcache_wq.
 */
static void request_read_done_bh(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);

	if (s->error)
		continue_at_nobarrier(cl, request_read_error, bcache_wq);
	else if (s->op.cache_bio || verify(dc, &s->bio.bio))
		continue_at_nobarrier(cl, request_read_done, bcache_wq);
	else
		continue_at_nobarrier(cl, cached_dev_read_complete, NULL);
}
910 | |||
/*
 * Handle up to @sectors of cache miss at @bio's current position.
 *
 * Splits off the missing range and submits it to the backing device.
 * Unless the miss is bypassed, also allocates op.cache_bio — the miss
 * plus optional readahead — to be read from the backing device and then
 * inserted into the cache, reserving its keyspace in the btree via
 * bch_btree_insert_check_key().
 *
 * Returns 0 to continue the lookup, -EAGAIN if the split allocation
 * failed, or -EINTR once the btree iterator has been invalidated.
 */
static int cached_dev_cache_miss(struct btree *b, struct search *s,
				 struct bio *bio, unsigned sectors)
{
	int ret = 0;
	unsigned reada;
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
	struct bio *miss;

	miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
	if (!miss)
		return -EAGAIN;

	/* the split consumed the whole bio: nothing left to look up */
	if (miss == bio)
		s->op.lookup_done = true;

	miss->bi_end_io = request_endio;
	miss->bi_private = &s->cl;

	/* already servicing a miss, or bypassing the cache: don't insert */
	if (s->cache_miss || s->op.skip)
		goto out_submit;

	/* no readahead for partial misses, RA/metadata IO, or a nearly full cache */
	if (miss != bio ||
	    (bio->bi_rw & REQ_RAHEAD) ||
	    (bio->bi_rw & REQ_META) ||
	    s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
		reada = 0;
	else {
		reada = min(dc->readahead >> 9,
			    sectors - bio_sectors(miss));

		/* clamp so we never read past the end of the backing device */
		if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev))
			reada = bdev_sectors(miss->bi_bdev) - bio_end(miss);
	}

	s->cache_bio_sectors = bio_sectors(miss) + reada;
	s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
			DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
			dc->disk.bio_split);

	if (!s->op.cache_bio)
		goto out_submit;

	s->op.cache_bio->bi_sector = miss->bi_sector;
	s->op.cache_bio->bi_bdev = miss->bi_bdev;
	s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;

	s->op.cache_bio->bi_end_io = request_endio;
	s->op.cache_bio->bi_private = &s->cl;

	/* btree_search_recurse()'s btree iterator is no good anymore */
	ret = -EINTR;
	if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
		goto out_put;

	bch_bio_map(s->op.cache_bio, NULL);
	if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
		goto out_put;

	s->cache_miss = miss;
	/* extra ref: endio drops one; read_done/read_complete still need the bio */
	bio_get(s->op.cache_bio);

	trace_bcache_cache_miss(s->orig_bio);
	closure_bio_submit(s->op.cache_bio, &s->cl, s->d);

	return ret;
out_put:
	bio_put(s->op.cache_bio);
	s->op.cache_bio = NULL;
out_submit:
	closure_bio_submit(miss, &s->cl, s->d);
	return ret;
}
983 | |||
/*
 * Entry point for reads on a cached device: decide whether to bypass the
 * cache, start the async btree lookup, and continue into the bottom half
 * once the lookup and any resulting IO complete.
 */
static void request_read(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;

	check_should_skip(dc, s);
	closure_call(&s->op.cl, btree_read_async, NULL, cl);

	continue_at(cl, request_read_done_bh, NULL);
}
993 | |||
994 | /* Process writes */ | ||
995 | |||
/*
 * Write completion: release the writeback lock taken in request_write().
 * The lock is taken with down_read_non_owner() because it is released
 * here, from a different context than the one that acquired it.
 */
static void cached_dev_write_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	up_read_non_owner(&dc->writeback_lock);
	cached_dev_bio_complete(cl);
}
1004 | |||
1005 | static bool should_writeback(struct cached_dev *dc, struct bio *bio) | ||
1006 | { | ||
1007 | unsigned threshold = (bio->bi_rw & REQ_SYNC) | ||
1008 | ? CUTOFF_WRITEBACK_SYNC | ||
1009 | : CUTOFF_WRITEBACK; | ||
1010 | |||
1011 | return !atomic_read(&dc->disk.detaching) && | ||
1012 | cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && | ||
1013 | dc->disk.c->gc_stats.in_use < threshold; | ||
1014 | } | ||
1015 | |||
/*
 * Entry point for writes (and discards) on a cached device.
 *
 * Chooses between writeback (write only to the cache; flushed to the
 * backing device later), writethrough (clone inserted into the cache
 * while the original goes to the backing device), and bypass (backing
 * device only, with the range invalidated in the cache by inserting
 * with op.skip set).
 */
static void request_write(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;
	struct bio *bio = &s->bio.bio;
	struct bkey start, end;
	start = KEY(dc->disk.id, bio->bi_sector, 0);
	end = KEY(dc->disk.id, bio_end(bio), 0);

	/* keep moving GC from racing with this write's keyspace */
	bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);

	check_should_skip(dc, s);
	down_read_non_owner(&dc->writeback_lock);

	/* overlaps dirty data being written back: must use writeback to stay coherent */
	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
		s->op.skip = false;
		s->writeback = true;
	}

	if (bio->bi_rw & REQ_DISCARD)
		goto skip;

	if (s->op.skip)
		goto skip;

	if (should_writeback(dc, s->orig_bio))
		s->writeback = true;

	if (!s->writeback) {
		/* writethrough: clone for the cache insert, original to backing dev */
		s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
						   dc->disk.bio_split);

		trace_bcache_writethrough(s->orig_bio);
		closure_bio_submit(bio, cl, s->d);
	} else {
		s->op.cache_bio = bio;
		trace_bcache_writeback(s->orig_bio);
		bch_writeback_add(dc, bio_sectors(bio));
	}
out:
	closure_call(&s->op.cl, bch_insert_data, NULL, cl);
	continue_at(cl, cached_dev_write_complete, NULL);
skip:
	/* bypass: insert with skip set so the range is invalidated in the cache */
	s->op.skip = true;
	s->op.cache_bio = s->orig_bio;
	bio_get(s->op.cache_bio);
	trace_bcache_write_skip(s->orig_bio);

	/* discards the backing device can't handle are completed without IO */
	if ((bio->bi_rw & REQ_DISCARD) &&
	    !blk_queue_discard(bdev_get_queue(dc->bdev)))
		goto out;

	closure_bio_submit(bio, cl, s->d);
	goto out;
}
1070 | |||
/*
 * Handle bios that carry no data (flushes/empty barriers; discards are
 * rerouted to request_write()).  Journals a meta entry if a flush was
 * requested, forwards the bio to the backing device, and completes.
 */
static void request_nodata(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;
	struct bio *bio = &s->bio.bio;

	if (bio->bi_rw & REQ_DISCARD) {
		request_write(dc, s);
		return;
	}

	if (s->op.flush_journal)
		bch_journal_meta(s->op.c, cl);

	closure_bio_submit(bio, cl, s->d);

	continue_at(cl, cached_dev_bio_complete, NULL);
}
1088 | |||
1089 | /* Cached devices - read & write stuff */ | ||
1090 | |||
1091 | int bch_get_congested(struct cache_set *c) | ||
1092 | { | ||
1093 | int i; | ||
1094 | |||
1095 | if (!c->congested_read_threshold_us && | ||
1096 | !c->congested_write_threshold_us) | ||
1097 | return 0; | ||
1098 | |||
1099 | i = (local_clock_us() - c->congested_last_us) / 1024; | ||
1100 | if (i < 0) | ||
1101 | return 0; | ||
1102 | |||
1103 | i += atomic_read(&c->congested); | ||
1104 | if (i >= 0) | ||
1105 | return 0; | ||
1106 | |||
1107 | i += CONGESTED_MAX; | ||
1108 | |||
1109 | return i <= 0 ? 1 : fract_exp_two(i, 6); | ||
1110 | } | ||
1111 | |||
/*
 * Fold the task's just-completed sequential IO run into its exponentially
 * weighted moving average, then reset the running counter.
 */
static void add_sequential(struct task_struct *t)
{
	ewma_add(t->sequential_io_avg,
		 t->sequential_io, 8, 0);

	t->sequential_io = 0;
}
1119 | |||
1120 | static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) | ||
1121 | { | ||
1122 | return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; | ||
1123 | } | ||
1124 | |||
/*
 * Decide whether this request should bypass the cache, based on cache
 * mode, device state, alignment, congestion, and detected sequential IO.
 * Sets s->op.skip to bypass; otherwise rescales bucket priorities.
 */
static void check_should_skip(struct cached_dev *dc, struct search *s)
{
	struct cache_set *c = s->op.c;
	struct bio *bio = &s->bio.bio;

	long rand;
	int cutoff = bch_get_congested(c);
	unsigned mode = cache_mode(dc, bio);

	if (atomic_read(&dc->disk.detaching) ||
	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
	    (bio->bi_rw & REQ_DISCARD))
		goto skip;

	if (mode == CACHE_MODE_NONE ||
	    (mode == CACHE_MODE_WRITEAROUND &&
	     (bio->bi_rw & REQ_WRITE)))
		goto skip;

	/* can't cache IO that isn't aligned to the cache's block size */
	if (bio->bi_sector & (c->sb.block_size - 1) ||
	    bio_sectors(bio) & (c->sb.block_size - 1)) {
		pr_debug("skipping unaligned io");
		goto skip;
	}

	/*
	 * Not congested: fall back to the plain sequential cutoff.  If that
	 * is disabled, or this is a sync write in writeback mode (which we
	 * always want to cache), skip the sequential heuristics entirely.
	 */
	if (!cutoff) {
		cutoff = dc->sequential_cutoff >> 9;

		if (!cutoff)
			goto rescale;

		if (mode == CACHE_MODE_WRITEBACK &&
		    (bio->bi_rw & REQ_WRITE) &&
		    (bio->bi_rw & REQ_SYNC))
			goto rescale;
	}

	if (dc->sequential_merge) {
		struct io *i;

		spin_lock(&dc->io_lock);

		/* look for an unexpired recent IO that this one continues */
		hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
			if (i->last == bio->bi_sector &&
			    time_before(jiffies, i->jiffies))
				goto found;

		/* no match: recycle the least recently used tracking slot */
		i = list_first_entry(&dc->io_lru, struct io, lru);

		add_sequential(s->task);
		i->sequential = 0;
found:
		/* saturating add: guard against counter wraparound */
		if (i->sequential + bio->bi_size > i->sequential)
			i->sequential += bio->bi_size;

		i->last = bio_end(bio);
		i->jiffies = jiffies + msecs_to_jiffies(5000);
		s->task->sequential_io = i->sequential;

		/* rehash under the new end sector and mark most recently used */
		hlist_del(&i->hash);
		hlist_add_head(&i->hash, iohash(dc, i->last));
		list_move_tail(&i->lru, &dc->io_lru);

		spin_unlock(&dc->io_lock);
	} else {
		s->task->sequential_io = bio->bi_size;

		add_sequential(s->task);
	}

	/* randomly jitter the cutoff so the threshold isn't a hard edge */
	rand = get_random_int();
	cutoff -= bitmap_weight(&rand, BITS_PER_LONG);

	if (cutoff <= (int) (max(s->task->sequential_io,
				 s->task->sequential_io_avg) >> 9))
		goto skip;

rescale:
	bch_rescale_priorities(c, bio_sectors(bio));
	return;
skip:
	bch_mark_sectors_bypassed(s, bio_sectors(bio));
	s->op.skip = true;
}
1209 | |||
/*
 * make_request_fn for cached (backing) devices.  Accounts the IO, remaps
 * the bio onto the backing device (offset past the bcache superblock),
 * and dispatches to the read/write/nodata paths — or, when no cache is
 * attached (cached_dev_get() fails), passes the bio straight through.
 */
static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
{
	struct search *s;
	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
	int cpu, rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &d->disk->part0, ios[rw]);
	part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	bio->bi_bdev = dc->bdev;
	bio->bi_sector += dc->sb.data_offset;

	if (cached_dev_get(dc)) {
		s = search_alloc(bio, d);
		trace_bcache_request_start(s, bio);

		if (!bio_has_data(bio))
			request_nodata(dc, s);
		else if (rw)
			request_write(dc, s);
		else
			request_read(dc, s);
	} else {
		/* no cache: complete unsupported discards, pass the rest through */
		if ((bio->bi_rw & REQ_DISCARD) &&
		    !blk_queue_discard(bdev_get_queue(dc->bdev)))
			bio_endio(bio, 0);
		else
			bch_generic_make_request(bio, &d->bio_split_hook);
	}
}
1243 | |||
1244 | static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, | ||
1245 | unsigned int cmd, unsigned long arg) | ||
1246 | { | ||
1247 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | ||
1248 | return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); | ||
1249 | } | ||
1250 | |||
1251 | static int cached_dev_congested(void *data, int bits) | ||
1252 | { | ||
1253 | struct bcache_device *d = data; | ||
1254 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); | ||
1255 | struct request_queue *q = bdev_get_queue(dc->bdev); | ||
1256 | int ret = 0; | ||
1257 | |||
1258 | if (bdi_congested(&q->backing_dev_info, bits)) | ||
1259 | return 1; | ||
1260 | |||
1261 | if (cached_dev_get(dc)) { | ||
1262 | unsigned i; | ||
1263 | struct cache *ca; | ||
1264 | |||
1265 | for_each_cache(ca, d->c, i) { | ||
1266 | q = bdev_get_queue(ca->bdev); | ||
1267 | ret |= bdi_congested(&q->backing_dev_info, bits); | ||
1268 | } | ||
1269 | |||
1270 | cached_dev_put(dc); | ||
1271 | } | ||
1272 | |||
1273 | return ret; | ||
1274 | } | ||
1275 | |||
1276 | void bch_cached_dev_request_init(struct cached_dev *dc) | ||
1277 | { | ||
1278 | struct gendisk *g = dc->disk.disk; | ||
1279 | |||
1280 | g->queue->make_request_fn = cached_dev_make_request; | ||
1281 | g->queue->backing_dev_info.congested_fn = cached_dev_congested; | ||
1282 | dc->disk.cache_miss = cached_dev_cache_miss; | ||
1283 | dc->disk.ioctl = cached_dev_ioctl; | ||
1284 | } | ||
1285 | |||
1286 | /* Flash backed devices */ | ||
1287 | |||
/*
 * "Cache miss" for a flash-only volume: there is no backing device, so a
 * miss means the data simply doesn't exist.  Zero-fill up to @sectors of
 * the bio, advancing its bvecs/sector/size as we go.  Always returns 0;
 * marks the lookup done once the whole bio has been consumed.
 */
static int flash_dev_cache_miss(struct btree *b, struct search *s,
				struct bio *bio, unsigned sectors)
{
	/* Zero fill bio */

	while (bio->bi_idx != bio->bi_vcnt) {
		struct bio_vec *bv = bio_iovec(bio);
		unsigned j = min(bv->bv_len >> 9, sectors);

		void *p = kmap(bv->bv_page);
		memset(p + bv->bv_offset, 0, j << 9);
		kunmap(bv->bv_page);

		bv->bv_len -= j << 9;
		bv->bv_offset += j << 9;

		/* ran out of @sectors mid-bvec: more of the bio remains to look up */
		if (bv->bv_len)
			return 0;

		bio->bi_sector += j;
		bio->bi_size -= j << 9;

		bio->bi_idx++;
		sectors -= j;
	}

	s->op.lookup_done = true;

	return 0;
}
1318 | |||
/*
 * make_request_fn for flash-only volumes (no backing device): reads go
 * through the btree lookup, writes/discards are inserted directly into
 * the btree, and empty flushes just journal a meta entry.
 */
static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
{
	struct search *s;
	struct closure *cl;
	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
	int cpu, rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &d->disk->part0, ios[rw]);
	part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	s = search_alloc(bio, d);
	cl = &s->cl;
	bio = &s->bio.bio;

	trace_bcache_request_start(s, bio);

	if (bio_has_data(bio) && !rw) {
		closure_call(&s->op.cl, btree_read_async, NULL, cl);
	} else if (bio_has_data(bio) || s->op.skip) {
		/* write or discard (op.skip set for discard): insert into the btree */
		bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
					     &KEY(d->id, bio->bi_sector, 0),
					     &KEY(d->id, bio_end(bio), 0));

		s->writeback = true;
		s->op.cache_bio = bio;

		closure_call(&s->op.cl, bch_insert_data, NULL, cl);
	} else {
		/* No data - probably a cache flush */
		if (s->op.flush_journal)
			bch_journal_meta(s->op.c, cl);
	}

	continue_at(cl, search_free, NULL);
}
1356 | |||
1357 | static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, | ||
1358 | unsigned int cmd, unsigned long arg) | ||
1359 | { | ||
1360 | return -ENOTTY; | ||
1361 | } | ||
1362 | |||
1363 | static int flash_dev_congested(void *data, int bits) | ||
1364 | { | ||
1365 | struct bcache_device *d = data; | ||
1366 | struct request_queue *q; | ||
1367 | struct cache *ca; | ||
1368 | unsigned i; | ||
1369 | int ret = 0; | ||
1370 | |||
1371 | for_each_cache(ca, d->c, i) { | ||
1372 | q = bdev_get_queue(ca->bdev); | ||
1373 | ret |= bdi_congested(&q->backing_dev_info, bits); | ||
1374 | } | ||
1375 | |||
1376 | return ret; | ||
1377 | } | ||
1378 | |||
1379 | void bch_flash_dev_request_init(struct bcache_device *d) | ||
1380 | { | ||
1381 | struct gendisk *g = d->disk; | ||
1382 | |||
1383 | g->queue->make_request_fn = flash_dev_make_request; | ||
1384 | g->queue->backing_dev_info.congested_fn = flash_dev_congested; | ||
1385 | d->cache_miss = flash_dev_cache_miss; | ||
1386 | d->ioctl = flash_dev_ioctl; | ||
1387 | } | ||
1388 | |||
1389 | void bch_request_exit(void) | ||
1390 | { | ||
1391 | #ifdef CONFIG_CGROUP_BCACHE | ||
1392 | cgroup_unload_subsys(&bcache_subsys); | ||
1393 | #endif | ||
1394 | if (bch_search_cache) | ||
1395 | kmem_cache_destroy(bch_search_cache); | ||
1396 | } | ||
1397 | |||
/*
 * Module init for the request path: create the kmem cache backing the
 * per-request struct search mempools, and register the bcache cgroup
 * subsystem when configured.  Returns 0 or -ENOMEM.
 */
int __init bch_request_init(void)
{
	bch_search_cache = KMEM_CACHE(search, 0);
	if (!bch_search_cache)
		return -ENOMEM;

#ifdef CONFIG_CGROUP_BCACHE
	cgroup_load_subsys(&bcache_subsys);
	init_bch_cgroup(&bcache_default_cgroup);

	/* NOTE(review): cgroup_add_cftypes() return value is ignored — confirm intended */
	cgroup_add_cftypes(&bcache_subsys, bch_files);
#endif
	return 0;
}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h new file mode 100644 index 000000000000..254d9ab5707c --- /dev/null +++ b/drivers/md/bcache/request.h | |||
@@ -0,0 +1,62 @@ | |||
1 | #ifndef _BCACHE_REQUEST_H_ | ||
2 | #define _BCACHE_REQUEST_H_ | ||
3 | |||
4 | #include <linux/cgroup.h> | ||
5 | |||
6 | struct search { | ||
7 | /* Stack frame for bio_complete */ | ||
8 | struct closure cl; | ||
9 | |||
10 | struct bcache_device *d; | ||
11 | struct task_struct *task; | ||
12 | |||
13 | struct bbio bio; | ||
14 | struct bio *orig_bio; | ||
15 | struct bio *cache_miss; | ||
16 | unsigned cache_bio_sectors; | ||
17 | |||
18 | unsigned recoverable:1; | ||
19 | unsigned unaligned_bvec:1; | ||
20 | |||
21 | unsigned write:1; | ||
22 | unsigned writeback:1; | ||
23 | |||
24 | /* IO error returned to s->bio */ | ||
25 | short error; | ||
26 | unsigned long start_time; | ||
27 | |||
28 | /* Anything past op->keys won't get zeroed in do_bio_hook */ | ||
29 | struct btree_op op; | ||
30 | }; | ||
31 | |||
32 | void bch_cache_read_endio(struct bio *, int); | ||
33 | int bch_get_congested(struct cache_set *); | ||
34 | void bch_insert_data(struct closure *cl); | ||
35 | void bch_btree_insert_async(struct closure *); | ||
36 | void bch_cache_read_endio(struct bio *, int); | ||
37 | |||
38 | void bch_open_buckets_free(struct cache_set *); | ||
39 | int bch_open_buckets_alloc(struct cache_set *); | ||
40 | |||
41 | void bch_cached_dev_request_init(struct cached_dev *dc); | ||
42 | void bch_flash_dev_request_init(struct bcache_device *d); | ||
43 | |||
44 | extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; | ||
45 | |||
46 | struct bch_cgroup { | ||
47 | #ifdef CONFIG_CGROUP_BCACHE | ||
48 | struct cgroup_subsys_state css; | ||
49 | #endif | ||
50 | /* | ||
51 | * We subtract one from the index into bch_cache_modes[], so that | ||
52 | * default == -1; this makes it so the rest match up with d->cache_mode, | ||
53 | * and we use d->cache_mode if cgrp->cache_mode < 0 | ||
54 | */ | ||
55 | short cache_mode; | ||
56 | bool verify; | ||
57 | struct cache_stat_collector stats; | ||
58 | }; | ||
59 | |||
60 | struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio); | ||
61 | |||
62 | #endif /* _BCACHE_REQUEST_H_ */ | ||
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c new file mode 100644 index 000000000000..64e679449c2a --- /dev/null +++ b/drivers/md/bcache/stats.c | |||
@@ -0,0 +1,246 @@ | |||
1 | /* | ||
2 | * bcache stats code | ||
3 | * | ||
4 | * Copyright 2012 Google, Inc. | ||
5 | */ | ||
6 | |||
7 | #include "bcache.h" | ||
8 | #include "stats.h" | ||
9 | #include "btree.h" | ||
10 | #include "request.h" | ||
11 | #include "sysfs.h" | ||
12 | |||
13 | /* | ||
 * We keep absolute totals of various statistics, and additionally a set of three
15 | * rolling averages. | ||
16 | * | ||
17 | * Every so often, a timer goes off and rescales the rolling averages. | ||
18 | * accounting_rescale[] is how many times the timer has to go off before we | ||
19 | * rescale each set of numbers; that gets us half lives of 5 minutes, one hour, | ||
20 | * and one day. | ||
21 | * | ||
22 | * accounting_delay is how often the timer goes off - 22 times in 5 minutes, | ||
23 | * and accounting_weight is what we use to rescale: | ||
24 | * | ||
25 | * pow(31 / 32, 22) ~= 1/2 | ||
26 | * | ||
27 | * So that we don't have to increment each set of numbers every time we (say) | ||
28 | * get a cache hit, we increment a single atomic_t in acc->collector, and when | ||
29 | * the rescale function runs it resets the atomic counter to 0 and adds its | ||
30 | * old value to each of the exported numbers. | ||
31 | * | ||
32 | * To reduce rounding error, the numbers in struct cache_stats are all | ||
33 | * stored left shifted by 16, and scaled back in the sysfs show() function. | ||
34 | */ | ||
35 | |||
36 | static const unsigned DAY_RESCALE = 288; | ||
37 | static const unsigned HOUR_RESCALE = 12; | ||
38 | static const unsigned FIVE_MINUTE_RESCALE = 1; | ||
39 | static const unsigned accounting_delay = (HZ * 300) / 22; | ||
40 | static const unsigned accounting_weight = 32; | ||
41 | |||
42 | /* sysfs reading/writing */ | ||
43 | |||
44 | read_attribute(cache_hits); | ||
45 | read_attribute(cache_misses); | ||
46 | read_attribute(cache_bypass_hits); | ||
47 | read_attribute(cache_bypass_misses); | ||
48 | read_attribute(cache_hit_ratio); | ||
49 | read_attribute(cache_readaheads); | ||
50 | read_attribute(cache_miss_collisions); | ||
51 | read_attribute(bypassed); | ||
52 | |||
/*
 * sysfs show() for one cache_stats kobject.  Counters are stored left
 * shifted by 16 to reduce EWMA rounding error; var() scales them back
 * for display.
 */
SHOW(bch_stats)
{
	struct cache_stats *s =
		container_of(kobj, struct cache_stats, kobj);
#define var(stat) (s->stat >> 16)
	var_print(cache_hits);
	var_print(cache_misses);
	var_print(cache_bypass_hits);
	var_print(cache_bypass_misses);

	/* hit ratio as an integer percentage; DIV_SAFE guards divide-by-zero */
	sysfs_print(cache_hit_ratio,
		    DIV_SAFE(var(cache_hits) * 100,
			     var(cache_hits) + var(cache_misses)));

	var_print(cache_readaheads);
	var_print(cache_miss_collisions);
	sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
#undef var
	return 0;
}
73 | |||
/* Stats attributes are effectively read-only: writes are accepted and ignored. */
STORE(bch_stats)
{
	return size;
}
78 | |||
/*
 * kobject release: nothing to free — the cache_stats kobjects are
 * embedded in struct cache_accounting (see bch_cache_accounting_init()),
 * whose storage is managed by its owner.
 */
static void bch_stats_release(struct kobject *k)
{
}
82 | |||
/* sysfs attributes exported under each stats_* directory */
static struct attribute *bch_stats_files[] = {
	&sysfs_cache_hits,
	&sysfs_cache_misses,
	&sysfs_cache_bypass_hits,
	&sysfs_cache_bypass_misses,
	&sysfs_cache_hit_ratio,
	&sysfs_cache_readaheads,
	&sysfs_cache_miss_collisions,
	&sysfs_bypassed,
	NULL
};
static KTYPE(bch_stats);
95 | |||
96 | static void scale_accounting(unsigned long data); | ||
97 | |||
/*
 * Initialize the four stats kobjects (total/five-minute/hour/day), hook
 * the accounting closure under @parent, and start the periodic rescale
 * timer, which fires every accounting_delay jiffies.
 */
void bch_cache_accounting_init(struct cache_accounting *acc,
			       struct closure *parent)
{
	kobject_init(&acc->total.kobj, &bch_stats_ktype);
	kobject_init(&acc->five_minute.kobj, &bch_stats_ktype);
	kobject_init(&acc->hour.kobj, &bch_stats_ktype);
	kobject_init(&acc->day.kobj, &bch_stats_ktype);

	closure_init(&acc->cl, parent);
	init_timer(&acc->timer);
	acc->timer.expires = jiffies + accounting_delay;
	acc->timer.data = (unsigned long) acc;
	acc->timer.function = scale_accounting;
	add_timer(&acc->timer);
}
113 | |||
114 | int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, | ||
115 | struct kobject *parent) | ||
116 | { | ||
117 | int ret = kobject_add(&acc->total.kobj, parent, | ||
118 | "stats_total"); | ||
119 | ret = ret ?: kobject_add(&acc->five_minute.kobj, parent, | ||
120 | "stats_five_minute"); | ||
121 | ret = ret ?: kobject_add(&acc->hour.kobj, parent, | ||
122 | "stats_hour"); | ||
123 | ret = ret ?: kobject_add(&acc->day.kobj, parent, | ||
124 | "stats_day"); | ||
125 | return ret; | ||
126 | } | ||
127 | |||
/*
 * Zero the lifetime totals.  NOTE(review): the size is hard-coded as
 * 7 unsigned longs starting at cache_hits — this assumes the seven
 * counters are contiguous unsigned longs in struct cache_stats; verify
 * if fields are ever added or reordered there.
 */
void bch_cache_accounting_clear(struct cache_accounting *acc)
{
	memset(&acc->total.cache_hits,
	       0,
	       sizeof(unsigned long) * 7);
}
134 | |||
/*
 * Tear down the accounting: drop the sysfs kobjects and stop the rescale
 * timer.  If del_timer_sync() cancelled a pending timer we return the
 * closure here; otherwise the already-running scale_accounting() sees
 * acc->closing set and returns it instead.
 */
void bch_cache_accounting_destroy(struct cache_accounting *acc)
{
	kobject_put(&acc->total.kobj);
	kobject_put(&acc->five_minute.kobj);
	kobject_put(&acc->hour.kobj);
	kobject_put(&acc->day.kobj);

	atomic_set(&acc->closing, 1);
	if (del_timer_sync(&acc->timer))
		closure_return(&acc->cl);
}
146 | |||
147 | /* EWMA scaling */ | ||
148 | |||
/* Decay *stat one step toward zero via the project's EWMA helper
 * (new sample 0, weight accounting_weight). */
static void scale_stat(unsigned long *stat)
{
	*stat = ewma_add(*stat, 0, accounting_weight, 0);
}
153 | |||
154 | static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) | ||
155 | { | ||
156 | if (++stats->rescale == rescale_at) { | ||
157 | stats->rescale = 0; | ||
158 | scale_stat(&stats->cache_hits); | ||
159 | scale_stat(&stats->cache_misses); | ||
160 | scale_stat(&stats->cache_bypass_hits); | ||
161 | scale_stat(&stats->cache_bypass_misses); | ||
162 | scale_stat(&stats->cache_readaheads); | ||
163 | scale_stat(&stats->cache_miss_collisions); | ||
164 | scale_stat(&stats->sectors_bypassed); | ||
165 | } | ||
166 | } | ||
167 | |||
/*
 * Timer callback: drain the atomic per-IO collector counters into all
 * four cache_stats sets, apply EWMA decay to the timed sets, then
 * re-arm the timer unless bch_cache_accounting_destroy() has set
 * acc->closing (in which case the closure is returned from here).
 */
static void scale_accounting(unsigned long data)
{
	struct cache_accounting *acc = (struct cache_accounting *) data;

/* Atomically grab-and-reset one collector counter and add it to every
 * stat set.  The value is shifted left by 16 — presumably 16.16
 * fixed point so the EWMA decay keeps fractional precision; confirm
 * against the sysfs read side. */
#define move_stat(name) do {						\
	unsigned t = atomic_xchg(&acc->collector.name, 0);		\
	t <<= 16;							\
	acc->five_minute.name += t;					\
	acc->hour.name += t;						\
	acc->day.name += t;						\
	acc->total.name += t;						\
} while (0)

	move_stat(cache_hits);
	move_stat(cache_misses);
	move_stat(cache_bypass_hits);
	move_stat(cache_bypass_misses);
	move_stat(cache_readaheads);
	move_stat(cache_miss_collisions);
	move_stat(sectors_bypassed);

	/* rescale_at == 0 for total: its counters are not decayed in
	 * practice (the rescale counter would have to wrap first). */
	scale_stats(&acc->total, 0);
	scale_stats(&acc->day, DAY_RESCALE);
	scale_stats(&acc->hour, HOUR_RESCALE);
	scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);

	acc->timer.expires += accounting_delay;

	if (!atomic_read(&acc->closing))
		add_timer(&acc->timer);
	else
		closure_return(&acc->cl);
}
201 | |||
202 | static void mark_cache_stats(struct cache_stat_collector *stats, | ||
203 | bool hit, bool bypass) | ||
204 | { | ||
205 | if (!bypass) | ||
206 | if (hit) | ||
207 | atomic_inc(&stats->cache_hits); | ||
208 | else | ||
209 | atomic_inc(&stats->cache_misses); | ||
210 | else | ||
211 | if (hit) | ||
212 | atomic_inc(&stats->cache_bypass_hits); | ||
213 | else | ||
214 | atomic_inc(&stats->cache_bypass_misses); | ||
215 | } | ||
216 | |||
/*
 * Account one cache lookup (hit/miss, bypassed or not) against both the
 * backing device's and the cache set's collectors — and, when
 * CONFIG_CGROUP_BCACHE is enabled, against the originating bio's cgroup.
 */
void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass)
{
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
	mark_cache_stats(&dc->accounting.collector, hit, bypass);
	mark_cache_stats(&s->op.c->accounting.collector, hit, bypass);
#ifdef CONFIG_CGROUP_BCACHE
	mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
#endif
}
226 | |||
227 | void bch_mark_cache_readahead(struct search *s) | ||
228 | { | ||
229 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
230 | atomic_inc(&dc->accounting.collector.cache_readaheads); | ||
231 | atomic_inc(&s->op.c->accounting.collector.cache_readaheads); | ||
232 | } | ||
233 | |||
234 | void bch_mark_cache_miss_collision(struct search *s) | ||
235 | { | ||
236 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
237 | atomic_inc(&dc->accounting.collector.cache_miss_collisions); | ||
238 | atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); | ||
239 | } | ||
240 | |||
241 | void bch_mark_sectors_bypassed(struct search *s, int sectors) | ||
242 | { | ||
243 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
244 | atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); | ||
245 | atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); | ||
246 | } | ||
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h new file mode 100644 index 000000000000..c7c7a8fd29fe --- /dev/null +++ b/drivers/md/bcache/stats.h | |||
@@ -0,0 +1,58 @@ | |||
1 | #ifndef _BCACHE_STATS_H_ | ||
2 | #define _BCACHE_STATS_H_ | ||
3 | |||
/*
 * Hot-path counters: bumped with atomic ops from the IO paths
 * (bch_mark_*) and periodically drained back to zero (atomic_xchg) by
 * scale_accounting() in stats.c.
 */
struct cache_stat_collector {
	atomic_t cache_hits;
	atomic_t cache_misses;
	atomic_t cache_bypass_hits;
	atomic_t cache_bypass_misses;
	atomic_t cache_readaheads;
	atomic_t cache_miss_collisions;
	atomic_t sectors_bypassed;
};
13 | |||
/*
 * One accumulated set of statistics, exported through @kobj.
 * scale_accounting() adds collector deltas shifted left by 16, and
 * scale_stats() decays the counters each rescale period.
 *
 * NOTE: bch_cache_accounting_clear() resets these counters as a group;
 * keep that function in sync when adding or reordering fields here.
 */
struct cache_stats {
	struct kobject kobj;

	unsigned long cache_hits;
	unsigned long cache_misses;
	unsigned long cache_bypass_hits;
	unsigned long cache_bypass_misses;
	unsigned long cache_readaheads;
	unsigned long cache_miss_collisions;
	unsigned long sectors_bypassed;

	/* Calls since last decay; compared against the set's rescale
	 * interval in scale_stats(). */
	unsigned rescale;
};
27 | |||
/*
 * Per-device (and per-cache-set) accounting state: the atomic collector
 * fed by the IO paths, plus four EWMA-decayed views over different time
 * scales.  The timer drives scale_accounting(); @closing and @cl
 * coordinate shutdown between the timer callback and destroy().
 */
struct cache_accounting {
	struct closure cl;
	struct timer_list timer;
	atomic_t closing;

	struct cache_stat_collector collector;

	struct cache_stats total;
	struct cache_stats five_minute;
	struct cache_stats hour;
	struct cache_stats day;
};
40 | |||
struct search;

/* Initialize kobjects and arm the rescale timer. */
void bch_cache_accounting_init(struct cache_accounting *acc,
			       struct closure *parent);

/* Register the stats_* kobjects under @parent in sysfs. */
int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
				   struct kobject *parent);

/* Reset the cumulative ("total") counters. */
void bch_cache_accounting_clear(struct cache_accounting *acc);

/* Drop kobjects and stop the timer. */
void bch_cache_accounting_destroy(struct cache_accounting *acc);

/* IO-path hooks: record lookups, readaheads, collisions, bypasses. */
void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass);
void bch_mark_cache_readahead(struct search *s);
void bch_mark_cache_miss_collision(struct search *s);
void bch_mark_sectors_bypassed(struct search *s, int sectors);
57 | |||
58 | #endif /* _BCACHE_STATS_H_ */ | ||
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c new file mode 100644 index 000000000000..c8046bc4aa57 --- /dev/null +++ b/drivers/md/bcache/super.c | |||
@@ -0,0 +1,1987 @@ | |||
1 | /* | ||
2 | * bcache setup/teardown code, and some metadata io - read a superblock and | ||
3 | * figure out what to do with it. | ||
4 | * | ||
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
6 | * Copyright 2012 Google, Inc. | ||
7 | */ | ||
8 | |||
9 | #include "bcache.h" | ||
10 | #include "btree.h" | ||
11 | #include "debug.h" | ||
12 | #include "request.h" | ||
13 | |||
14 | #include <linux/buffer_head.h> | ||
15 | #include <linux/debugfs.h> | ||
16 | #include <linux/genhd.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/random.h> | ||
19 | #include <linux/reboot.h> | ||
20 | #include <linux/sysfs.h> | ||
21 | |||
22 | MODULE_LICENSE("GPL"); | ||
23 | MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); | ||
24 | |||
/* 16-byte magic identifying a bcache superblock on disk
 * (compared in read_super()). */
static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};
29 | |||
/* Well-known UUID written into a uuid_entry to mark it invalidated
 * when a device detaches (see bcache_device_detach()). */
static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};
34 | |||
/*
 * Cache mode names, indexed for sysfs display/parsing.
 * Default is -1; we skip past it for struct cached_dev's cache mode.
 */
const char * const bch_cache_modes[] = {
	"default",
	"writethrough",
	"writeback",
	"writearound",
	"none",
	NULL
};
44 | |||
/* On-disk uuid entry layout prior to BCACHE_JSET_VERSION_UUIDv1;
 * uuid_read() converts these in place to the larger struct uuid_entry. */
struct uuid_entry_v0 {
	uint8_t uuid[16];
	uint8_t label[32];
	uint32_t first_reg;
	uint32_t last_reg;
	uint32_t invalidated;
	uint32_t pad;
};
53 | |||
/* Module-global state. */
static struct kobject *bcache_kobj;
struct mutex bch_register_lock;		/* serializes register/unregister state */
LIST_HEAD(bch_cache_sets);		/* all registered cache sets */
/* NOTE(review): presumably backing devices not yet attached to a
 * cache set — confirm against the registration path. */
static LIST_HEAD(uncached_devices);

/* bcache_minor is the next minor to hand out (bumped in
 * bcache_device_init()). */
static int bcache_major, bcache_minor;
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;

#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)
64 | |||
/* Free a device's bio split pools; safe to call on a partially
 * initialized pool (NULL members are skipped). */
static void bio_split_pool_free(struct bio_split_pool *p)
{
	if (p->bio_split_hook)
		mempool_destroy(p->bio_split_hook);

	if (p->bio_split)
		bioset_free(p->bio_split);
}
73 | |||
74 | static int bio_split_pool_init(struct bio_split_pool *p) | ||
75 | { | ||
76 | p->bio_split = bioset_create(4, 0); | ||
77 | if (!p->bio_split) | ||
78 | return -ENOMEM; | ||
79 | |||
80 | p->bio_split_hook = mempool_create_kmalloc_pool(4, | ||
81 | sizeof(struct bio_split_hook)); | ||
82 | if (!p->bio_split_hook) | ||
83 | return -ENOMEM; | ||
84 | |||
85 | return 0; | ||
86 | } | ||
87 | |||
88 | /* Superblock */ | ||
89 | |||
/*
 * Read and validate the bcache superblock from @bdev.
 *
 * Decodes the on-disk (little-endian) struct cache_sb into native-endian
 * @sb, validates it according to its version (backing device vs cache
 * device), and on success stores a referenced page holding the raw
 * superblock in *res.  Returns NULL on success or a static error string.
 */
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
			      struct page **res)
{
	const char *err;
	struct cache_sb *s;
	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
	unsigned i;

	if (!bh)
		return "IO error";

	s = (struct cache_sb *) bh->b_data;

	sb->offset = le64_to_cpu(s->offset);
	sb->version = le64_to_cpu(s->version);

	memcpy(sb->magic, s->magic, 16);
	memcpy(sb->uuid, s->uuid, 16);
	memcpy(sb->set_uuid, s->set_uuid, 16);
	memcpy(sb->label, s->label, SB_LABEL_SIZE);

	sb->flags = le64_to_cpu(s->flags);
	sb->seq = le64_to_cpu(s->seq);
	sb->last_mount = le32_to_cpu(s->last_mount);
	sb->first_bucket = le16_to_cpu(s->first_bucket);
	sb->keys = le16_to_cpu(s->keys);

	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
		sb->d[i] = le64_to_cpu(s->d[i]);

	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 sb->version, sb->flags, sb->seq, sb->keys);

	err = "Not a bcache superblock";
	if (sb->offset != SB_SECTOR)
		goto err;

	if (memcmp(sb->magic, bcache_magic, 16))
		goto err;

	/* Checked before the checksum so sb->keys is sane for csum_set(). */
	err = "Too many journal buckets";
	if (sb->keys > SB_JOURNAL_BUCKETS)
		goto err;

	err = "Bad checksum";
	if (s->csum != csum_set(s))
		goto err;

	err = "Bad UUID";
	if (bch_is_zero(sb->uuid, 16))
		goto err;

	sb->block_size = le16_to_cpu(s->block_size);

	err = "Superblock block size smaller than device block size";
	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
		goto err;

	switch (sb->version) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset = BDEV_DATA_START_DEFAULT;
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		sb->data_offset = le64_to_cpu(s->data_offset);

		err = "Bad data offset";
		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
			goto err;

		break;
	case BCACHE_SB_VERSION_CDEV:
	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
		sb->nbuckets = le64_to_cpu(s->nbuckets);
		/* NOTE(review): block_size was already decoded above —
		 * this second read is redundant but harmless. */
		sb->block_size = le16_to_cpu(s->block_size);
		sb->bucket_size = le16_to_cpu(s->bucket_size);

		sb->nr_in_set = le16_to_cpu(s->nr_in_set);
		sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);

		err = "Too many buckets";
		if (sb->nbuckets > LONG_MAX)
			goto err;

		err = "Not enough buckets";
		if (sb->nbuckets < 1 << 7)
			goto err;

		err = "Bad block/bucket size";
		if (!is_power_of_2(sb->block_size) ||
		    sb->block_size > PAGE_SECTORS ||
		    !is_power_of_2(sb->bucket_size) ||
		    sb->bucket_size < PAGE_SECTORS)
			goto err;

		/* NOTE(review): bucket_size * nbuckets could in theory
		 * overflow for hostile superblocks — verify the operand
		 * types make this safe after the checks above. */
		err = "Invalid superblock: device too small";
		if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
			goto err;

		err = "Bad UUID";
		if (bch_is_zero(sb->set_uuid, 16))
			goto err;

		err = "Bad cache device number in set";
		if (!sb->nr_in_set ||
		    sb->nr_in_set <= sb->nr_this_dev ||
		    sb->nr_in_set > MAX_CACHES_PER_SET)
			goto err;

		err = "Journal buckets not sequential";
		for (i = 0; i < sb->keys; i++)
			if (sb->d[i] != sb->first_bucket + i)
				goto err;

		err = "Too many journal buckets";
		if (sb->first_bucket + sb->keys > sb->nbuckets)
			goto err;

		err = "Invalid superblock: first bucket comes before end of super";
		if (sb->first_bucket * sb->bucket_size < 16)
			goto err;

		break;
	default:
		err = "Unsupported superblock version";
		goto err;
	}

	sb->last_mount = get_seconds();
	err = NULL;

	/* Success: hand the caller a reference on the raw sb page, then
	 * fall through to the common buffer_head release. */
	get_page(bh->b_page);
	*res = bh->b_page;
err:
	put_bh(bh);
	return err;
}
226 | |||
/* Completion for a backing-device superblock write: releases the
 * reference taken in bch_write_bdev_super(). */
static void write_bdev_super_endio(struct bio *bio, int error)
{
	struct cached_dev *dc = bio->bi_private;
	/* XXX: error checking */

	closure_put(&dc->sb_write.cl);
}
234 | |||
/*
 * Encode @sb into the (already attached) first page of @bio in on-disk
 * little-endian format, compute the checksum, and submit the write.
 * Caller must have set bi_bdev/bi_end_io/bi_private and hold a closure
 * reference released by the endio handler.
 */
static void __write_super(struct cache_sb *sb, struct bio *bio)
{
	struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
	unsigned i;

	bio->bi_sector = SB_SECTOR;
	bio->bi_rw = REQ_SYNC|REQ_META;
	bio->bi_size = SB_SIZE;
	bch_bio_map(bio, NULL);

	out->offset = cpu_to_le64(sb->offset);
	out->version = cpu_to_le64(sb->version);

	memcpy(out->uuid, sb->uuid, 16);
	memcpy(out->set_uuid, sb->set_uuid, 16);
	memcpy(out->label, sb->label, SB_LABEL_SIZE);

	out->flags = cpu_to_le64(sb->flags);
	out->seq = cpu_to_le64(sb->seq);

	out->last_mount = cpu_to_le32(sb->last_mount);
	out->first_bucket = cpu_to_le16(sb->first_bucket);
	out->keys = cpu_to_le16(sb->keys);

	for (i = 0; i < sb->keys; i++)
		out->d[i] = cpu_to_le64(sb->d[i]);

	/* Checksum covers the encoded image, so compute it last. */
	out->csum = csum_set(out);

	pr_debug("ver %llu, flags %llu, seq %llu",
		 sb->version, sb->flags, sb->seq);

	submit_bio(REQ_WRITE, bio);
}
269 | |||
/*
 * Write the backing device's superblock.  closure_lock() serializes
 * concurrent writers on dc->sb_write; the extra closure_get() is
 * dropped by write_bdev_super_endio() when the IO completes.
 */
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write.cl;
	struct bio *bio = &dc->sb_bio;

	closure_lock(&dc->sb_write, parent);

	bio_reset(bio);
	bio->bi_bdev = dc->bdev;
	bio->bi_end_io = write_bdev_super_endio;
	bio->bi_private = dc;

	closure_get(cl);
	__write_super(&dc->sb, bio);

	closure_return(cl);
}
287 | |||
/* Completion for one cache device's superblock write: record any IO
 * error against the device and drop the per-write reference. */
static void write_super_endio(struct bio *bio, int error)
{
	struct cache *ca = bio->bi_private;

	bch_count_io_errors(ca, error, "writing superblock");
	closure_put(&ca->set->sb_write.cl);
}
295 | |||
/*
 * Write the (bumped-seq) superblock to every cache device in the set.
 * One closure reference is taken per in-flight bio and released by
 * write_super_endio().
 */
void bcache_write_super(struct cache_set *c)
{
	struct closure *cl = &c->sb_write.cl;
	struct cache *ca;
	unsigned i;

	closure_lock(&c->sb_write, &c->cl);

	c->sb.seq++;

	for_each_cache(ca, c, i) {
		struct bio *bio = &ca->sb_bio;

		/* Propagate set-wide fields into each member's sb copy. */
		ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
		ca->sb.seq = c->sb.seq;
		ca->sb.last_mount = c->sb.last_mount;

		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));

		bio_reset(bio);
		bio->bi_bdev = ca->bdev;
		bio->bi_end_io = write_super_endio;
		bio->bi_private = ca;

		closure_get(cl);
		__write_super(&ca->sb, bio);
	}

	closure_return(cl);
}
326 | |||
327 | /* UUID io */ | ||
328 | |||
/* Completion for a uuid bucket read/write: flag errors against the
 * cache set, free the bbio, drop the uuid_write reference. */
static void uuid_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);

	cache_set_err_on(error, c, "accessing uuids");
	bch_bbio_free(bio, c);
	closure_put(cl);
}
338 | |||
/*
 * Read or write the uuid array (c->uuids) at the bucket(s) pointed to
 * by @k.  Writes go to every replica pointer in the key; reads stop
 * after submitting against the first pointer.  Serialized on
 * c->uuid_write via closure_lock().
 */
static void uuid_io(struct cache_set *c, unsigned long rw,
		    struct bkey *k, struct closure *parent)
{
	struct closure *cl = &c->uuid_write.cl;
	struct uuid_entry *u;
	unsigned i;

	BUG_ON(!parent);
	closure_lock(&c->uuid_write, parent);

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bio *bio = bch_bbio_alloc(c);

		bio->bi_rw = REQ_SYNC|REQ_META|rw;
		bio->bi_size = KEY_SIZE(k) << 9;

		bio->bi_end_io = uuid_endio;
		bio->bi_private = cl;
		bch_bio_map(bio, c->uuids);

		bch_submit_bbio(bio, c, k, i);

		/* Reads only need one copy; writes hit every replica. */
		if (!(rw & WRITE))
			break;
	}

	pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read",
		 pkey(&c->uuid_bucket));

	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
		if (!bch_is_zero(u->uuid, 16))
			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
				 u - c->uuids, u->uuid, u->label,
				 u->first_reg, u->last_reg, u->invalidated);

	closure_return(cl);
}
376 | |||
/*
 * Load the uuid array from the bucket recorded in the journal header
 * @j, upgrading old-format (pre-UUIDv1) entries in place.  Returns
 * NULL on success or a static error string.
 */
static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
	struct bkey *k = &j->uuid_bucket;

	if (__bch_ptr_invalid(c, 1, k))
		return "bad uuid pointer";

	bkey_copy(&c->uuid_bucket, k);
	uuid_io(c, READ_SYNC, k, cl);

	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
		struct uuid_entry_v0 *u0 = (void *) c->uuids;
		struct uuid_entry *u1 = (void *) c->uuids;
		int i;

		/* Wait for the read above before rewriting the buffer. */
		closure_sync(cl);

		/*
		 * Since the new uuid entry is bigger than the old, we have to
		 * convert starting at the highest memory address and work down
		 * in order to do it in place
		 */

		for (i = c->nr_uuids - 1;
		     i >= 0;
		     --i) {
			memcpy(u1[i].uuid, u0[i].uuid, 16);
			memcpy(u1[i].label, u0[i].label, 32);

			u1[i].first_reg = u0[i].first_reg;
			u1[i].last_reg = u0[i].last_reg;
			u1[i].invalidated = u0[i].invalidated;

			u1[i].flags = 0;
			u1[i].sectors = 0;
		}
	}

	return NULL;
}
417 | |||
/*
 * Allocate a fresh bucket, synchronously write the uuid array to it,
 * and record its key in c->uuid_bucket.  Returns 0 on success, 1 if
 * bucket allocation failed.
 */
static int __uuid_write(struct cache_set *c)
{
	BKEY_PADDED(key) k;
	struct closure cl;
	closure_init_stack(&cl);

	lockdep_assert_held(&bch_register_lock);

	if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl))
		return 1;

	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
	uuid_io(c, REQ_WRITE, &k.key, &cl);
	closure_sync(&cl);

	/* Only point at the new bucket once the write has completed. */
	bkey_copy(&c->uuid_bucket, &k.key);
	__bkey_put(c, &k.key);
	return 0;
}
437 | |||
438 | int bch_uuid_write(struct cache_set *c) | ||
439 | { | ||
440 | int ret = __uuid_write(c); | ||
441 | |||
442 | if (!ret) | ||
443 | bch_journal_meta(c, NULL); | ||
444 | |||
445 | return ret; | ||
446 | } | ||
447 | |||
448 | static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) | ||
449 | { | ||
450 | struct uuid_entry *u; | ||
451 | |||
452 | for (u = c->uuids; | ||
453 | u < c->uuids + c->nr_uuids; u++) | ||
454 | if (!memcmp(u->uuid, uuid, 16)) | ||
455 | return u; | ||
456 | |||
457 | return NULL; | ||
458 | } | ||
459 | |||
/* Find an unused slot: one whose uuid is all zeroes. */
static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	/* static storage is zero-initialized, so this is 16 NUL bytes */
	static const char zero_uuid[16];

	return uuid_find(c, zero_uuid);
}
465 | |||
466 | /* | ||
467 | * Bucket priorities/gens: | ||
468 | * | ||
469 | * For each bucket, we store on disk its | ||
470 | * 8 bit gen | ||
471 | * 16 bit priority | ||
472 | * | ||
473 | * See alloc.c for an explanation of the gen. The priority is used to implement | ||
474 | * lru (and in the future other) cache replacement policies; for most purposes | ||
475 | * it's just an opaque integer. | ||
476 | * | ||
477 | * The gens and the priorities don't have a whole lot to do with each other, and | ||
478 | * it's actually the gens that must be written out at specific times - it's no | ||
479 | * big deal if the priorities don't get written, if we lose them we just reuse | ||
480 | * buckets in suboptimal order. | ||
481 | * | ||
482 | * On disk they're stored in a packed array, and in as many buckets are required | ||
483 | * to fit them all. The buckets we use to store them form a list; the journal | ||
484 | * header points to the first bucket, the first bucket points to the second | ||
485 | * bucket, et cetera. | ||
486 | * | ||
487 | * This code is used by the allocation code; periodically (whenever it runs out | ||
488 | * of buckets to allocate from) the allocation code will invalidate some | ||
489 | * buckets, but it can't use those buckets until their new gens are safely on | ||
490 | * disk. | ||
491 | */ | ||
492 | |||
/* Completion for a priority bucket read/write: flag errors against the
 * set, free the bbio, and wake prio_io()'s on-stack closure. */
static void prio_endio(struct bio *bio, int error)
{
	struct cache *ca = bio->bi_private;

	cache_set_err_on(error, ca->set, "accessing priorities");
	bch_bbio_free(bio, ca->set);
	closure_put(&ca->prio);
}
501 | |||
/*
 * Synchronously read or write one bucket's worth of priorities
 * (ca->disk_buckets) at @bucket.  Blocks until the IO completes.
 */
static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
{
	struct closure *cl = &ca->prio;
	struct bio *bio = bch_bbio_alloc(ca->set);

	closure_init_stack(cl);

	bio->bi_sector = bucket * ca->sb.bucket_size;
	bio->bi_bdev = ca->bdev;
	bio->bi_rw = REQ_SYNC|REQ_META|rw;
	bio->bi_size = bucket_bytes(ca);

	bio->bi_end_io = prio_endio;
	bio->bi_private = ca;
	bch_bio_map(bio, ca->disk_buckets);

	closure_bio_submit(bio, &ca->prio, ca);
	closure_sync(cl);
}
521 | |||
/* Format-string + args pair describing free-bucket fifo occupancy,
 * for use inside blktrace_msg()/printf-style callers. */
#define buckets_free(c)	"free %zu, free_inc %zu, unused %zu",	\
	fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
524 | |||
/*
 * Write out every bucket's gen and priority to disk, then journal the
 * update.  Called with (and returns holding) ca->set->bucket_lock; the
 * lock is dropped around each bucket IO and around the journal write.
 *
 * Buckets are written highest-index first so that each prio_set's
 * next_bucket pointer (ca->prio_buckets[i + 1]) is already final when
 * bucket i is written, forming the on-disk linked list that prio_read()
 * follows.
 */
void bch_prio_write(struct cache *ca)
{
	int i;
	struct bucket *b;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&ca->set->bucket_lock);

	/* Snapshot: after this write, on-disk gens match in-memory. */
	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets; b++)
		b->disk_gen = b->gen;

	ca->disk_buckets->seq++;

	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
			&ca->meta_sectors_written);

	pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
		 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
	blktrace_msg(ca, "Starting priorities: " buckets_free(ca));

	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
		long bucket;
		struct prio_set *p = ca->disk_buckets;
		struct bucket_disk *d = p->data;
		struct bucket_disk *end = d + prios_per_bucket(ca);

		/* Pack this slice of buckets into the staging buffer. */
		for (b = ca->buckets + i * prios_per_bucket(ca);
		     b < ca->buckets + ca->sb.nbuckets && d < end;
		     b++, d++) {
			d->prio = cpu_to_le16(b->prio);
			d->gen = b->gen;
		}

		p->next_bucket = ca->prio_buckets[i + 1];
		p->magic = pset_magic(ca);
		p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);

		bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl);
		BUG_ON(bucket == -1);

		mutex_unlock(&ca->set->bucket_lock);
		prio_io(ca, bucket, REQ_WRITE);
		mutex_lock(&ca->set->bucket_lock);

		ca->prio_buckets[i] = bucket;
		atomic_dec_bug(&ca->buckets[bucket].pin);
	}

	mutex_unlock(&ca->set->bucket_lock);

	bch_journal_meta(ca->set, &cl);
	closure_sync(&cl);

	mutex_lock(&ca->set->bucket_lock);

	ca->need_save_prio = 0;

	/*
	 * Don't want the old priorities to get garbage collected until after we
	 * finish writing the new ones, and they're journalled
	 */
	for (i = 0; i < prio_buckets(ca); i++)
		ca->prio_last_buckets[i] = ca->prio_buckets[i];
}
592 | |||
/*
 * Load bucket gens/priorities starting from @bucket, following the
 * on-disk next_bucket chain written by bch_prio_write().
 *
 * d starts at end so the very first loop iteration triggers a read of
 * the first prio bucket before any entries are consumed.  Checksum and
 * magic mismatches are only warned about, not treated as fatal.
 */
static void prio_read(struct cache *ca, uint64_t bucket)
{
	struct prio_set *p = ca->disk_buckets;
	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
	struct bucket *b;
	unsigned bucket_nr = 0;

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets;
	     b++, d++) {
		if (d == end) {
			ca->prio_buckets[bucket_nr] = bucket;
			ca->prio_last_buckets[bucket_nr] = bucket;
			bucket_nr++;

			prio_io(ca, bucket, READ_SYNC);

			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
				pr_warn("bad csum reading priorities");

			if (p->magic != pset_magic(ca))
				pr_warn("bad magic reading priorities");

			bucket = p->next_bucket;
			d = p->data;
		}

		b->prio = le16_to_cpu(d->prio);
		b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
	}
}
624 | |||
625 | /* Bcache device */ | ||
626 | |||
/* block_device_operations.open: refuse opens once the device is being
 * torn down, otherwise pin it with a closure reference. */
static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;
	if (atomic_read(&d->closing))
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}
636 | |||
/* block_device_operations.release: drop the reference taken in open_dev(). */
static int release_dev(struct gendisk *b, fmode_t mode)
{
	struct bcache_device *d = b->private_data;
	closure_put(&d->cl);
	return 0;
}
643 | |||
/* block_device_operations.ioctl: dispatch to the device-type-specific
 * handler installed in d->ioctl. */
static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;
	return d->ioctl(d, mode, cmd, arg);
}
650 | |||
/* Block-layer entry points for every /dev/bcacheN device. */
static const struct block_device_operations bcache_ops = {
	.open		= open_dev,
	.release	= release_dev,
	.ioctl		= ioctl_dev,
	.owner		= THIS_MODULE,
};
657 | |||
/* Begin shutdown of @d.  atomic_xchg makes this idempotent: only the
 * first caller to set closing queues the teardown closure. */
void bcache_device_stop(struct bcache_device *d)
{
	if (!atomic_xchg(&d->closing, 1))
		closure_queue(&d->cl);
}
663 | |||
/* Remove the device <-> cache set sysfs symlinks and holder links
 * created by bcache_device_link(). */
static void bcache_device_unlink(struct bcache_device *d)
{
	unsigned i;
	struct cache *ca;

	sysfs_remove_link(&d->c->kobj, d->name);
	sysfs_remove_link(&d->kobj, "cache");

	for_each_cache(ca, d->c, i)
		bd_unlink_disk_holder(ca->bdev, d->disk);
}
675 | |||
/*
 * Create the device <-> cache set sysfs symlinks (named "<name><id>")
 * and the block-layer holder links; failures are warned, not fatal.
 */
static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
	unsigned i;
	struct cache *ca;

	for_each_cache(ca, d->c, i)
		bd_link_disk_holder(ca->bdev, d->disk);

	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%u", name, d->id);

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");
}
692 | |||
/*
 * Sever @d from its cache set.  If a user-requested detach is in
 * progress, the device's uuid slot is invalidated on disk first so the
 * cache won't be reused for stale data on the next registration.
 * Drops the caching-closure reference taken in bcache_device_attach().
 */
static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (atomic_read(&d->detaching)) {
		struct uuid_entry *u = d->c->uuids + d->id;

		SET_UUID_FLASH_ONLY(u, 0);
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		bch_uuid_write(d->c);

		atomic_set(&d->detaching, 0);
	}

	bcache_device_unlink(d);

	d->c->devices[d->id] = NULL;
	closure_put(&d->c->caching);
	d->c = NULL;
}
714 | |||
/* Wire @d into cache set @c at slot @id and pin the set's caching
 * closure until bcache_device_detach(). */
static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
				 unsigned id)
{
	BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));

	d->id = id;
	d->c = c;
	c->devices[id] = d;

	closure_get(&c->caching);
}
726 | |||
727 | static void bcache_device_free(struct bcache_device *d) | ||
728 | { | ||
729 | lockdep_assert_held(&bch_register_lock); | ||
730 | |||
731 | pr_info("%s stopped", d->disk->disk_name); | ||
732 | |||
733 | if (d->c) | ||
734 | bcache_device_detach(d); | ||
735 | |||
736 | if (d->disk) | ||
737 | del_gendisk(d->disk); | ||
738 | if (d->disk && d->disk->queue) | ||
739 | blk_cleanup_queue(d->disk->queue); | ||
740 | if (d->disk) | ||
741 | put_disk(d->disk); | ||
742 | |||
743 | bio_split_pool_free(&d->bio_split_hook); | ||
744 | if (d->unaligned_bvec) | ||
745 | mempool_destroy(d->unaligned_bvec); | ||
746 | if (d->bio_split) | ||
747 | bioset_free(d->bio_split); | ||
748 | |||
749 | closure_debug_destroy(&d->cl); | ||
750 | } | ||
751 | |||
/*
 * Allocate the generic parts of a bcache_device: bio pools, gendisk,
 * and request queue configured for @block_size sectors-per-block.
 * Returns 0 or -ENOMEM; on error the caller is expected to unwind via
 * bcache_device_free(), which tolerates partial initialization.
 *
 * NOTE(review): bcache_minor is read-modify-written without an explicit
 * lock here — presumably registration is serialized by
 * bch_register_lock; confirm at the call sites.
 */
static int bcache_device_init(struct bcache_device *d, unsigned block_size)
{
	struct request_queue *q;

	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
				sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
	    bio_split_pool_init(&d->bio_split_hook))

		return -ENOMEM;

	d->disk = alloc_disk(1);
	if (!d->disk)
		return -ENOMEM;

	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);

	d->disk->major		= bcache_major;
	d->disk->first_minor	= bcache_minor++;
	d->disk->fops		= &bcache_ops;
	d->disk->private_data	= d;

	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return -ENOMEM;

	/* make_request-based driver: no elevator, bios handled directly. */
	blk_queue_make_request(q, NULL);
	d->disk->queue			= q;
	q->queuedata			= d;
	q->backing_dev_info.congested_data = d;
	q->limits.max_hw_sectors	= UINT_MAX;
	q->limits.max_sectors		= UINT_MAX;
	q->limits.max_segment_size	= UINT_MAX;
	q->limits.max_segments		= BIO_MAX_PAGES;
	q->limits.max_discard_sectors	= UINT_MAX;
	q->limits.io_min		= block_size;
	q->limits.logical_block_size	= block_size;
	q->limits.physical_block_size	= block_size;
	set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
	set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);

	return 0;
}
795 | |||
796 | /* Cached device */ | ||
797 | |||
798 | static void calc_cached_dev_sectors(struct cache_set *c) | ||
799 | { | ||
800 | uint64_t sectors = 0; | ||
801 | struct cached_dev *dc; | ||
802 | |||
803 | list_for_each_entry(dc, &c->cached_devs, list) | ||
804 | sectors += bdev_sectors(dc->bdev); | ||
805 | |||
806 | c->cached_dev_sectors = sectors; | ||
807 | } | ||
808 | |||
/*
 * Make the bcache device visible to userspace: add the gendisk, link
 * the backing bdev as a holder, and create the sysfs cross-links.
 * Idempotent - the atomic_xchg() on dc->running makes repeat calls
 * no-ops.
 */
void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;

	if (atomic_xchg(&dc->running, 1))
		return;

	/*
	 * Running without a cache set while the superblock claims some
	 * state: mark the on-disk state stale so previously cached data
	 * is never trusted after a later attach.
	 */
	if (!d->c &&
	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
		struct closure cl;
		closure_init_stack(&cl);

		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);	/* wait for the superblock write */
	}

	add_disk(d->disk);
	bd_link_disk_holder(dc->bdev, dc->disk.disk);
#if 0
	char *env[] = { "SYMLINK=label" , NULL };
	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
#endif
	/* The links are a convenience only; failure is non-fatal. */
	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");
}
836 | |||
/*
 * Work item that completes a detach started by bch_cached_dev_detach();
 * by the time it runs, dc->count has dropped to zero (BUG_ON below).
 * Clears the set uuid in the backing superblock so the device will not
 * re-attach on the next registration, then detaches in-memory state.
 */
static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	char buf[BDEVNAME_SIZE];
	struct closure cl;
	closure_init_stack(&cl);

	BUG_ON(!atomic_read(&dc->disk.detaching));
	BUG_ON(atomic_read(&dc->count));

	mutex_lock(&bch_register_lock);

	/* Persistently forget the set membership. */
	memset(&dc->sb.set_uuid, 0, 16);
	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

	mutex_unlock(&bch_register_lock);

	pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));

	/* Drop ref we took in cached_dev_detach() */
	closure_put(&dc->disk.cl);
}
865 | |||
/*
 * Begin detaching a backing device from its cache set.  The actual
 * teardown happens asynchronously in cached_dev_detach_finish() -
 * presumably scheduled when the last ref on dc->count is dropped
 * (see the paired cached_dev_put() below; confirm against its
 * definition).  Caller must hold bch_register_lock.
 */
void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

	/* Device is being shut down entirely - nothing to do. */
	if (atomic_read(&dc->disk.closing))
		return;

	/* Only the first caller proceeds. */
	if (atomic_xchg(&dc->disk.detaching, 1))
		return;

	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching
	 */
	closure_get(&dc->disk.cl);

	/* Kick writeback to flush dirty data, then drop our count ref. */
	bch_writeback_queue(dc);
	cached_dev_put(dc);
}
885 | |||
/*
 * Attach a backing device to cache set c: validate compatibility, find
 * or allocate the device's uuid_entry in the set, persist the binding
 * in both the uuid bucket and the backing superblock, then bring the
 * bcache device online.  Returns 0 or a -errno.
 */
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
{
	uint32_t rtime = cpu_to_le32(get_seconds());
	struct uuid_entry *u;
	char buf[BDEVNAME_SIZE];

	bdevname(dc->bdev, buf);

	/* A device bound to some other set doesn't belong here. */
	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
		return -ENOENT;

	if (dc->disk.c) {
		pr_err("Can't attach %s: already attached", buf);
		return -EINVAL;
	}

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
		pr_err("Can't attach %s: shutting down", buf);
		return -EINVAL;
	}

	if (dc->sb.block_size < c->sb.block_size) {
		/* Will die */
		pr_err("Couldn't attach %s: block size less than set's block size",
		       buf);
		return -EINVAL;
	}

	u = uuid_find(c, dc->sb.uuid);

	/*
	 * A stale/none device must not reuse its old uuid slot - data
	 * indexed under it is no longer valid.  Invalidate the slot and
	 * fall through to allocating a fresh one below.
	 */
	if (u &&
	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		u = NULL;
	}

	if (!u) {
		/* Dirty device with no uuid entry: its dirty data is gone. */
		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
			pr_err("Couldn't find uuid for %s in set", buf);
			return -ENOENT;
		}

		u = uuid_find_empty(c);
		if (!u) {
			pr_err("Not caching %s, no room for UUID", buf);
			return -EINVAL;
		}
	}

	/* Deadlocks since we're called via sysfs...
	sysfs_remove_file(&dc->kobj, &sysfs_attach);
	*/

	if (bch_is_zero(u->uuid, 16)) {
		/* Fresh slot: record the binding on both ends, durably. */
		struct closure cl;
		closure_init_stack(&cl);

		memcpy(u->uuid, dc->sb.uuid, 16);
		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
		u->first_reg = u->last_reg = rtime;
		bch_uuid_write(c);

		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		/* Known device re-registering: just refresh the timestamp. */
		u->last_reg = rtime;
		bch_uuid_write(c);
	}

	bcache_device_attach(&dc->disk, c, u - c->uuids);
	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	smp_wmb();
	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */
	atomic_set(&dc->count, 1);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
		/* Extra ref held by writeback until dirty data is flushed. */
		atomic_set(&dc->has_dirty, 1);
		atomic_inc(&dc->count);
		bch_writeback_queue(dc);
	}

	bch_cached_dev_run(dc);
	bcache_device_link(&dc->disk, c, "bdev");

	pr_info("Caching %s as %s on set %pU",
		bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
		dc->disk.c->sb.set_uuid);
	return 0;
}
985 | |||
986 | void bch_cached_dev_release(struct kobject *kobj) | ||
987 | { | ||
988 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | ||
989 | disk.kobj); | ||
990 | kfree(dc); | ||
991 | module_put(THIS_MODULE); | ||
992 | } | ||
993 | |||
/*
 * Final closure stage for a cached_dev, run after cached_dev_flush():
 * tears down the block device, releases the backing bdev, and drops
 * the kobject ref whose release kfree()s the struct.
 */
static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

	/* No more writeback-rate timer callbacks after this point. */
	cancel_delayed_work_sync(&dc->writeback_rate_update);

	mutex_lock(&bch_register_lock);

	bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

	/* dc->bdev may be an ERR_PTR/NULL if registration failed early. */
	if (!IS_ERR_OR_NULL(dc->bdev)) {
		blk_sync_queue(bdev_get_queue(dc->bdev));
		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
	}

	/* Let anyone waiting for unregistration re-check. */
	wake_up(&unregister_wait);

	kobject_put(&dc->disk.kobj);
}
1017 | |||
/*
 * First teardown stage for a cached_dev (set as the disk closure's fn
 * in cached_dev_init()): remove sysfs state, then continue to
 * cached_dev_free() on a workqueue.
 */
static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}
1028 | |||
/*
 * One-time initialization of a cached_dev, called from register_bdev()
 * before the superblock is copied in.  Sets up the teardown closure,
 * kobject, accounting, the generic bcache_device, and the
 * sequential-IO detection state.  Returns 0 or -errno; on error the
 * device is stopped (its closure/kobject teardown runs).
 */
static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
{
	int err;
	struct io *io;

	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);

	/* Dropped in bch_cached_dev_release(). */
	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);

	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);

	err = bcache_device_init(&dc->disk, block_size);
	if (err)
		goto err;

	spin_lock_init(&dc->io_lock);
	closure_init_unlocked(&dc->sb_write);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);

	/* Default: treat sequential IO beyond 4MB specially. */
	dc->sequential_merge = true;
	dc->sequential_cutoff = 4 << 20;

	INIT_LIST_HEAD(&dc->io_lru);
	dc->sb_bio.bi_max_vecs = 1;
	dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;

	/*
	 * All recent-IO entries start on the LRU, hashed into slot
	 * RECENT_IO - presumably an overflow bucket for entries not
	 * currently tracking an IO stream; confirm against the lookup code.
	 */
	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

	bch_writeback_init_cached_dev(dc);
	return 0;
err:
	bcache_device_stop(&dc->disk);
	return err;
}
1069 | |||
1070 | /* Cached device - bcache superblock */ | ||
1071 | |||
/*
 * Register a backing device whose bcache superblock has just been read.
 * Takes ownership of dc and bdev.  Returns NULL on success AND on
 * kobject errors (kobject_put() runs the full cleanup path); only an
 * early allocation failure returns an error string for the caller to
 * report.
 */
static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
				 struct block_device *bdev,
				 struct cached_dev *dc)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";
	struct gendisk *g;
	struct cache_set *c;

	if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0)
		return err;

	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
	dc->bdev = bdev;
	dc->bdev->bd_holder = dc;

	g = dc->disk.disk;

	/* Cached data starts at data_offset; capacity excludes the sb. */
	set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset);

	/* Inherit the larger readahead setting of the two queues. */
	g->queue->backing_dev_info.ra_pages =
		max(g->queue->backing_dev_info.ra_pages,
		    bdev->bd_queue->backing_dev_info.ra_pages);

	bch_cached_dev_request_init(dc);

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
			"bcache"))
		goto err;
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

	/* Try attaching to every cache set already registered. */
	list_add(&dc->list, &uncached_devices);
	list_for_each_entry(c, &bch_cache_sets, list)
		bch_cached_dev_attach(dc, c);

	/* These states need no cache to run - expose the device now. */
	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

	return NULL;
err:
	kobject_put(&dc->disk.kobj);
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	/*
	 * Return NULL instead of an error because kobject_put() cleans
	 * everything up
	 */
	return NULL;
}
1124 | |||
1125 | /* Flash only volumes */ | ||
1126 | |||
1127 | void bch_flash_dev_release(struct kobject *kobj) | ||
1128 | { | ||
1129 | struct bcache_device *d = container_of(kobj, struct bcache_device, | ||
1130 | kobj); | ||
1131 | kfree(d); | ||
1132 | } | ||
1133 | |||
1134 | static void flash_dev_free(struct closure *cl) | ||
1135 | { | ||
1136 | struct bcache_device *d = container_of(cl, struct bcache_device, cl); | ||
1137 | bcache_device_free(d); | ||
1138 | kobject_put(&d->kobj); | ||
1139 | } | ||
1140 | |||
/*
 * First teardown stage for a flash-only volume: unlink it from the
 * cache set's sysfs tree and delete its kobject, then continue to
 * flash_dev_free() on a workqueue.
 */
static void flash_dev_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	bcache_device_unlink(d);
	kobject_del(&d->kobj);
	continue_at(cl, flash_dev_free, system_wq);
}
1149 | |||
/*
 * Create and bring online one flash-only volume for uuid entry u.
 * Unlike cached devices there is no backing bdev; capacity comes from
 * u->sectors.  Returns 0 or -ENOMEM (all failures are reported as
 * -ENOMEM; partial state is torn down via kobject_put()).
 */
static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, flash_dev_flush, system_wq);

	kobject_init(&d->kobj, &bch_flash_dev_ktype);

	if (bcache_device_init(d, block_bytes(c)))
		goto err;

	/* Index within c->uuids doubles as the device id. */
	bcache_device_attach(d, c, u - c->uuids);
	set_capacity(d->disk, u->sectors);
	bch_flash_dev_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
		goto err;

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return -ENOMEM;
}
1180 | |||
1181 | static int flash_devs_run(struct cache_set *c) | ||
1182 | { | ||
1183 | int ret = 0; | ||
1184 | struct uuid_entry *u; | ||
1185 | |||
1186 | for (u = c->uuids; | ||
1187 | u < c->uuids + c->nr_uuids && !ret; | ||
1188 | u++) | ||
1189 | if (UUID_FLASH_ONLY(u)) | ||
1190 | ret = flash_dev_run(c, u); | ||
1191 | |||
1192 | return ret; | ||
1193 | } | ||
1194 | |||
1195 | int bch_flash_dev_create(struct cache_set *c, uint64_t size) | ||
1196 | { | ||
1197 | struct uuid_entry *u; | ||
1198 | |||
1199 | if (test_bit(CACHE_SET_STOPPING, &c->flags)) | ||
1200 | return -EINTR; | ||
1201 | |||
1202 | u = uuid_find_empty(c); | ||
1203 | if (!u) { | ||
1204 | pr_err("Can't create volume, no room for UUID"); | ||
1205 | return -EINVAL; | ||
1206 | } | ||
1207 | |||
1208 | get_random_bytes(u->uuid, 16); | ||
1209 | memset(u->label, 0, 32); | ||
1210 | u->first_reg = u->last_reg = cpu_to_le32(get_seconds()); | ||
1211 | |||
1212 | SET_UUID_FLASH_ONLY(u, 1); | ||
1213 | u->sectors = size >> 9; | ||
1214 | |||
1215 | bch_uuid_write(c); | ||
1216 | |||
1217 | return flash_dev_run(c, u); | ||
1218 | } | ||
1219 | |||
1220 | /* Cache set */ | ||
1221 | |||
/*
 * Report a fatal error on cache set c and shut the set down.  Returns
 * false if the set is already stopping (error ignored), true if this
 * call triggered the unregister.
 */
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	va_list args;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

	/* XXX: we can be called from atomic context
	acquire_console_sem();
	*/

	printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	printk(", disabling caching\n");

	bch_cache_set_unregister(c);
	return true;
}
1245 | |||
1246 | void bch_cache_set_release(struct kobject *kobj) | ||
1247 | { | ||
1248 | struct cache_set *c = container_of(kobj, struct cache_set, kobj); | ||
1249 | kfree(c); | ||
1250 | module_put(THIS_MODULE); | ||
1251 | } | ||
1252 | |||
/*
 * Final stage of cache set teardown, run from the base closure after
 * cache_set_flush().  Frees all per-set allocations in reverse order
 * of bch_cache_set_alloc(), drops the member caches' kobjects, and
 * finally drops the set's own kobject (whose release kfree()s it).
 */
static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;
	unsigned i;

	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	for_each_cache(ca, c, i)
		if (ca)
			kobject_put(&ca->kobj);

	/* Both buffers were sized by alloc_bucket_pages(). */
	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
	free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));

	kfree(c->fill_iter);
	if (c->bio_split)
		bioset_free(c->bio_split);
	if (c->bio_meta)
		mempool_destroy(c->bio_meta);
	if (c->search)
		mempool_destroy(c->search);
	kfree(c->devices);

	mutex_lock(&bch_register_lock);
	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}
1292 | |||
/*
 * Second stage of cache set teardown (continued from
 * __cache_set_unregister()): stops allocator threads, tears down
 * sysfs/accounting, and writes out dirty btree nodes.
 * closure_return() then drops to the base closure -> cache_set_free().
 */
static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct btree *b;

	/* Shut down allocator threads */
	set_bit(CACHE_SET_STOPPING_2, &c->flags);
	wake_up(&c->alloc_wait);

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

	/* Put the root on the cache list so the flush loop sees it too. */
	if (!IS_ERR_OR_NULL(c->root))
		list_add(&c->root->list, &c->btree_cache);

	/* Should skip this if we're unregistering because of an error */
	list_for_each_entry(b, &c->btree_cache, list)
		if (btree_node_dirty(b))
			bch_btree_write(b, true, NULL);

	closure_return(cl);
}
1317 | |||
/*
 * Entry point of the teardown closure chain, queued by
 * bch_cache_set_stop().  Detaches backing devices (only on a full
 * unregister, not a plain stop), stops flash-only volumes, then
 * continues to cache_set_flush().
 */
static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cached_dev *dc, *t;
	size_t i;

	mutex_lock(&bch_register_lock);

	if (test_bit(CACHE_SET_UNREGISTERING, &c->flags))
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			bch_cached_dev_detach(dc);

	for (i = 0; i < c->nr_uuids; i++)
		if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i]))
			bcache_device_stop(c->devices[i]);

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}
1338 | |||
1339 | void bch_cache_set_stop(struct cache_set *c) | ||
1340 | { | ||
1341 | if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) | ||
1342 | closure_queue(&c->caching); | ||
1343 | } | ||
1344 | |||
/*
 * Unregister (as opposed to merely stop) a cache set: also detaches
 * all backing devices.  The UNREGISTERING bit must be set before
 * bch_cache_set_stop() queues the teardown closure, which tests it in
 * __cache_set_unregister().
 */
void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}
1350 | |||
/* Allocate enough zeroed pages to hold one bucket's worth of data. */
#define alloc_bucket_pages(gfp, c)			\
	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))

/*
 * Allocate and initialize a cache_set from the first member cache's
 * superblock.  Sets up the teardown closure chain (caching ->
 * __cache_set_unregister, base cl -> cache_set_free), copies the
 * set-wide superblock fields, and allocates all per-set structures.
 * Returns the new set, or NULL; partial allocations are torn down via
 * bch_cache_set_unregister().
 */
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
	if (!c)
		return NULL;

	/* Dropped in bch_cache_set_release(). */
	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

	/* Maybe create continue_at_noreturn() and use it here? */
	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
	c->sb.block_size = sb->block_size;
	c->sb.bucket_size = sb->bucket_size;
	c->sb.nr_in_set = sb->nr_in_set;
	c->sb.last_mount = sb->last_mount;
	c->bucket_bits = ilog2(sb->bucket_size);
	c->block_bits = ilog2(sb->block_size);
	/* The uuid table holds as many entries as fit in one bucket. */
	c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);

	c->btree_pages = c->sb.bucket_size / PAGE_SECTORS;
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

	init_waitqueue_head(&c->alloc_wait);
	mutex_init(&c->bucket_lock);
	mutex_init(&c->fill_lock);
	mutex_init(&c->sort_lock);
	spin_lock_init(&c->sort_time_lock);
	closure_init_unlocked(&c->sb_write);
	closure_init_unlocked(&c->uuid_write);
	spin_lock_init(&c->btree_read_time_lock);
	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	c->search = mempool_create_slab_pool(32, bch_search_cache);
	if (!c->search)
		goto err;

	/* One iterator set per block in a bucket, plus one spare. */
	iter_size = (sb->bucket_size / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
				sizeof(struct bbio) + sizeof(struct bio_vec) *
				bucket_pages(c))) ||
	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
	    !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    bch_journal_alloc(c) ||
	    bch_btree_cache_alloc(c) ||
	    bch_open_buckets_alloc(c))
		goto err;

	c->fill_iter->size = sb->bucket_size / sb->block_size;

	c->congested_read_threshold_us = 2000;
	c->congested_write_threshold_us = 20000;
	c->error_limit = 8 << IO_ERROR_SHIFT;

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}
1439 | |||
/*
 * Bring a fully-assembled cache set online.  Two paths:
 *  - CACHE_SYNC set: replay the journal, read priorities and uuids,
 *    check the btree, then start allocators and replay outstanding
 *    journal entries.
 *  - CACHE_SYNC clear: treat existing contents as garbage - pick new
 *    journal buckets, write fresh prios/uuids and a new btree root.
 * On failure the set is torn down via bch_cache_set_error().
 */
static void run_cache_set(struct cache_set *c)
{
	const char *err = "cannot allocate memory";
	struct cached_dev *dc, *t;
	struct cache *ca;
	unsigned i;

	struct btree_op op;
	bch_btree_op_init_stack(&op);
	op.lock = SHRT_MAX;

	for_each_cache(ca, c, i)
		c->nbuckets += ca->sb.nbuckets;

	if (CACHE_SYNC(&c->sb)) {
		LIST_HEAD(journal);
		struct bkey *k;
		struct jset *j;

		err = "cannot allocate memory for journal";
		if (bch_journal_read(c, &journal, &op))
			goto err;

		pr_debug("btree_journal_read() done");

		err = "no journal entries found";
		if (list_empty(&journal))
			goto err;

		/* Newest entry holds the current prio buckets and root. */
		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		err = "IO error reading priorities";
		for_each_cache(ca, c, i)
			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);

		/*
		 * If prio_read() fails it'll call cache_set_error and we'll
		 * tear everything down right away, but if we perhaps checked
		 * sooner we could avoid journal replay.
		 */

		k = &j->btree_root;

		err = "bad btree root";
		if (__bch_ptr_invalid(c, j->btree_level + 1, k))
			goto err;

		err = "error reading btree root";
		c->root = bch_btree_node_get(c, k, j->btree_level, &op);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		list_del_init(&c->root->list);
		rw_unlock(true, c->root);

		err = uuid_read(c, j, &op.cl);
		if (err)
			goto err;

		err = "error in recovery";
		if (bch_btree_check(c, &op))
			goto err;

		bch_journal_mark(c, &journal);
		bch_btree_gc_finish(c);
		pr_debug("btree_check() done");

		/*
		 * bcache_journal_next() can't happen sooner, or
		 * btree_gc_finish() will give spurious errors about last_gc >
		 * gc_gen - this is a hack but oh well.
		 */
		bch_journal_next(&c->journal);

		/*
		 * NOTE(review): this path starts the allocators on
		 * system_wq while the !SYNC path below uses
		 * ca->alloc_workqueue - confirm the discrepancy is
		 * intentional.
		 */
		for_each_cache(ca, c, i)
			closure_call(&ca->alloc, bch_allocator_thread,
				     system_wq, &c->cl);

		/*
		 * First place it's safe to allocate: btree_check() and
		 * btree_gc_finish() have to run before we have buckets to
		 * allocate, and bch_bucket_alloc_set() might cause a journal
		 * entry to be written so bcache_journal_next() has to be called
		 * first.
		 *
		 * If the uuids were in the old format we have to rewrite them
		 * before the next journal entry is written:
		 */
		if (j->version < BCACHE_JSET_VERSION_UUID)
			__uuid_write(c);

		bch_journal_replay(c, &journal, &op);
	} else {
		pr_notice("invalidating existing data");
		/* Don't want invalidate_buckets() to queue a gc yet */
		closure_lock(&c->gc, NULL);

		for_each_cache(ca, c, i) {
			unsigned j;

			/* Reserve ~1/128th of buckets for the journal. */
			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
					      2, SB_JOURNAL_BUCKETS);

			for (j = 0; j < ca->sb.keys; j++)
				ca->sb.d[j] = ca->sb.first_bucket + j;
		}

		bch_btree_gc_finish(c);

		for_each_cache(ca, c, i)
			closure_call(&ca->alloc, bch_allocator_thread,
				     ca->alloc_workqueue, &c->cl);

		mutex_lock(&c->bucket_lock);
		for_each_cache(ca, c, i)
			bch_prio_write(ca);
		mutex_unlock(&c->bucket_lock);

		wake_up(&c->alloc_wait);

		err = "cannot allocate new UUID bucket";
		if (__uuid_write(c))
			goto err_unlock_gc;

		err = "cannot allocate new btree root";
		c->root = bch_btree_node_alloc(c, 0, &op.cl);
		if (IS_ERR_OR_NULL(c->root))
			goto err_unlock_gc;

		bkey_copy_key(&c->root->key, &MAX_KEY);
		bch_btree_write(c->root, true, &op);

		bch_btree_set_root(c->root);
		rw_unlock(true, c->root);

		/*
		 * We don't want to write the first journal entry until
		 * everything is set up - fortunately journal entries won't be
		 * written until the SET_CACHE_SYNC() here:
		 */
		SET_CACHE_SYNC(&c->sb, true);

		bch_journal_next(&c->journal);
		bch_journal_meta(c, &op.cl);

		/* Unlock */
		closure_set_stopped(&c->gc.cl);
		closure_put(&c->gc.cl);
	}

	closure_sync(&op.cl);
	c->sb.last_mount = get_seconds();
	bcache_write_super(c);

	/* Attach any backing devices that were waiting for this set. */
	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c);

	flash_devs_run(c);

	return;
err_unlock_gc:
	closure_set_stopped(&c->gc.cl);
	closure_put(&c->gc.cl);
err:
	closure_sync(&op.cl);
	/* XXX: test this, it's broken */
	bch_cache_set_error(c, err);
}
1608 | |||
1609 | static bool can_attach_cache(struct cache *ca, struct cache_set *c) | ||
1610 | { | ||
1611 | return ca->sb.block_size == c->sb.block_size && | ||
1612 | ca->sb.bucket_size == c->sb.block_size && | ||
1613 | ca->sb.nr_in_set == c->sb.nr_in_set; | ||
1614 | } | ||
1615 | |||
/*
 * Add cache ca to the cache set identified by its superblock's set
 * uuid, creating the set if this is the first member seen.  When the
 * last expected member arrives the set is started via run_cache_set().
 * Returns NULL on success or an error string.
 */
static const char *register_cache_set(struct cache *ca)
{
	char buf[12];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	list_for_each_entry(c, &bch_cache_sets, list)
		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
			if (c->cache[ca->sb.nr_this_dev])
				return "duplicate cache set member";

			if (!can_attach_cache(ca, c))
				return "cache sb does not match set";

			/* One unsynced member taints the whole set. */
			if (!CACHE_SYNC(&ca->sb))
				SET_CACHE_SYNC(&c->sb, false);

			goto found;
		}

	c = bch_cache_set_alloc(&ca->sb);
	if (!c)
		return err;

	err = "error creating kobject";
	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
	    kobject_add(&c->internal, &c->kobj, "internal"))
		goto err;

	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
		goto err;

	bch_debug_init_cache_set(c);

	list_add(&c->list, &bch_cache_sets);
found:
	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
		goto err;

	/* The member with the newest superblock wins the set-wide fields. */
	if (ca->sb.seq > c->sb.seq) {
		c->sb.version = ca->sb.version;
		memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
		c->sb.flags = ca->sb.flags;
		c->sb.seq = ca->sb.seq;
		pr_debug("set version = %llu", c->sb.version);
	}

	ca->set = c;
	ca->set->cache[ca->sb.nr_this_dev] = ca;
	c->cache_by_alloc[c->caches_loaded++] = ca;

	if (c->caches_loaded == c->sb.nr_in_set)
		run_cache_set(c);

	return NULL;
err:
	bch_cache_set_unregister(c);
	return err;
}
1677 | |||
1678 | /* Cache device */ | ||
1679 | |||
/*
 * kobject release for a cache device: undoes everything cache_alloc()
 * set up, in reverse order, then frees the struct and drops the module
 * ref.  Safe on a partially-initialized cache - teardown steps are
 * guarded or NULL-tolerant.
 */
void bch_cache_release(struct kobject *kobj)
{
	struct cache *ca = container_of(kobj, struct cache, kobj);

	if (ca->set)
		ca->set->cache[ca->sb.nr_this_dev] = NULL;

	bch_cache_allocator_exit(ca);

	bio_split_pool_free(&ca->bio_split_hook);

	if (ca->alloc_workqueue)
		destroy_workqueue(ca->alloc_workqueue);

	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
	kfree(ca->prio_buckets);
	vfree(ca->buckets);

	free_heap(&ca->heap);
	free_fifo(&ca->unused);
	free_fifo(&ca->free_inc);
	free_fifo(&ca->free);

	/* Presumably pinned when the superblock was read in - confirm. */
	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
		put_page(ca->sb_bio.bi_io_vec[0].bv_page);

	if (!IS_ERR_OR_NULL(ca->bdev)) {
		blk_sync_queue(bdev_get_queue(ca->bdev));
		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
	}

	kfree(ca);
	module_put(THIS_MODULE);
}
1714 | |||
/*
 * Allocate all in-memory state for one cache device from its
 * superblock: free-bucket fifos, the bucket array, prio buffers and
 * the allocator workqueue.  Returns 0 or -ENOMEM; on failure the
 * kobject release (bch_cache_release) frees whatever was allocated.
 */
static int cache_alloc(struct cache_sb *sb, struct cache *ca)
{
	size_t free;
	struct bucket *b;

	if (!ca)
		return -ENOMEM;

	/* Dropped in bch_cache_release(). */
	__module_get(THIS_MODULE);
	kobject_init(&ca->kobj, &bch_cache_ktype);

	memcpy(&ca->sb, sb, sizeof(struct cache_sb));

	INIT_LIST_HEAD(&ca->discards);

	bio_init(&ca->sb_bio);
	ca->sb_bio.bi_max_vecs = 1;
	ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;

	bio_init(&ca->journal.bio);
	ca->journal.bio.bi_max_vecs = 8;
	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;

	/*
	 * Free fifo sized at ~1/512th of the bucket count, but at least
	 * enough to cover prio writes plus slack.
	 */
	free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
	free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);

	if (!init_fifo(&ca->free, free, GFP_KERNEL) ||
	    !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
	    !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
	    !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
	    !(ca->buckets = vmalloc(sizeof(struct bucket) *
				    ca->sb.nbuckets)) ||
	    !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
					 2, GFP_KERNEL)) ||
	    !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
	    !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
	    bio_split_pool_init(&ca->bio_split_hook))
		goto err;

	/* Second half of the 2x prio_buckets allocation made above. */
	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);

	memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket));
	for_each_bucket(b, ca)
		atomic_set(&b->pin, 0);

	if (bch_cache_allocator_init(ca))
		goto err;

	return 0;
err:
	kobject_put(&ca->kobj);
	return -ENOMEM;
}
1768 | |||
/*
 * Register @bdev as a cache device.  The superblock @sb was read and
 * validated by the caller; the @sb_page reference is transferred to
 * ca->sb_bio for later superblock writes.
 *
 * Returns an error string only if cache_alloc() failed before the
 * kobject existed (caller then releases sb_page/bdev).  After that
 * point all errors return NULL, because kobject_put() ->
 * bch_cache_release() performs the full cleanup.
 */
static const char *register_cache(struct cache_sb *sb, struct page *sb_page,
				  struct block_device *bdev, struct cache *ca)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";

	if (cache_alloc(sb, ca) != 0)
		return err;

	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
	ca->bdev = bdev;
	ca->bdev->bd_holder = ca;

	/* Honor the superblock's discard flag only if the queue supports it */
	if (blk_queue_discard(bdev_get_queue(ca->bdev)))
		ca->discard = CACHE_DISCARD(&ca->sb);

	err = "error creating kobject";
	if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
		goto err;

	err = register_cache_set(ca);
	if (err)
		goto err;

	pr_info("registered cache device %s", bdevname(bdev, name));

	return NULL;
err:
	kobject_put(&ca->kobj);
	pr_info("error opening %s: %s", bdevname(bdev, name), err);
	/* Return NULL instead of an error because kobject_put() cleans
	 * everything up
	 */
	return NULL;
}
1804 | |||
1805 | /* Global interfaces/init */ | ||
1806 | |||
/* Forward declaration: handler shared by both register files below */
static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
			       const char *, size_t);

/* /sys/fs/bcache/register and register_quiet ("quiet" suppresses error logs) */
kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);
1812 | |||
/*
 * Handler for writes to /sys/fs/bcache/register{,_quiet}: @buffer is a
 * path to a block device.  Opens it exclusively, reads the superblock,
 * and registers it as either a backing device or a cache device.
 *
 * Runs under bch_register_lock and pins the module for the duration so
 * it can't be unloaded mid-registration.
 */
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size)
{
	ssize_t ret = size;
	const char *err = "cannot allocate memory";
	char *path = NULL;
	struct cache_sb *sb = NULL;
	struct block_device *bdev = NULL;
	struct page *sb_page = NULL;

	if (!try_module_get(THIS_MODULE))
		return -EBUSY;

	mutex_lock(&bch_register_lock);

	if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
	    !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
		goto err;

	err = "failed to open device";
	bdev = blkdev_get_by_path(strim(path),
				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				  sb);
	if (bdev == ERR_PTR(-EBUSY))
		err = "device busy";

	if (IS_ERR(bdev) ||
	    set_blocksize(bdev, 4096))
		goto err;

	err = read_super(sb, bdev, &sb_page);
	if (err)
		goto err_close;

	/* On-disk flag distinguishes backing devices from cache devices */
	if (SB_IS_BDEV(sb)) {
		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

		err = register_bdev(sb, sb_page, bdev, dc);
	} else {
		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);

		err = register_cache(sb, sb_page, bdev, ca);
	}

	if (err) {
		/* register_(bdev|cache) will only return an error if they
		 * didn't get far enough to create the kobject - if they did,
		 * the kobject destructor will do this cleanup.
		 */
		put_page(sb_page);
		/*
		 * The labels below live inside this if so earlier gotos
		 * fall through the remaining cleanup steps - unusual, but
		 * valid C.
		 */
err_close:
		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
		if (attr != &ksysfs_register_quiet)
			pr_info("error opening %s: %s", path, err);
		ret = -EINVAL;
	}

	kfree(sb);
	kfree(path);
	mutex_unlock(&bch_register_lock);
	module_put(THIS_MODULE);
	return ret;
}
1877 | |||
/*
 * Reboot notifier: on shutdown/halt/poweroff, stop every cache set and
 * uncached backing device, then wait up to two seconds for them to
 * finish tearing down before letting the system continue.
 */
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
	if (code == SYS_DOWN ||
	    code == SYS_HALT ||
	    code == SYS_POWER_OFF) {
		DEFINE_WAIT(wait);
		unsigned long start = jiffies;
		bool stopped = false;

		struct cache_set *c, *tc;
		struct cached_dev *dc, *tdc;

		mutex_lock(&bch_register_lock);

		if (list_empty(&bch_cache_sets) &&
		    list_empty(&uncached_devices))
			goto out;

		pr_info("Stopping all devices:");

		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
			bch_cache_set_stop(c);

		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
			bcache_device_stop(&dc->disk);

		/* What's a condition variable? */
		while (1) {
			/* Overall deadline: two seconds from when we started */
			long timeout = start + 2 * HZ - jiffies;

			stopped = list_empty(&bch_cache_sets) &&
				list_empty(&uncached_devices);

			if (timeout < 0 || stopped)
				break;

			prepare_to_wait(&unregister_wait, &wait,
					TASK_UNINTERRUPTIBLE);

			/* Drop the lock so the devices can actually tear down */
			mutex_unlock(&bch_register_lock);
			schedule_timeout(timeout);
			mutex_lock(&bch_register_lock);
		}

		finish_wait(&unregister_wait, &wait);

		if (stopped)
			pr_info("All devices stopped");
		else
			pr_notice("Timeout waiting for devices to be closed");
out:
		mutex_unlock(&bch_register_lock);
	}

	return NOTIFY_DONE;
}
1934 | |||
/* Highest priority so bcache stops before the block devices it sits on */
static struct notifier_block reboot = {
	.notifier_call = bcache_reboot,
	.priority = INT_MAX, /* before any real devices */
};
1939 | |||
/*
 * Module teardown: unwinds bcache_init() in reverse order.  Also used
 * as bcache_init()'s error path, so it must tolerate partial init -
 * hence the NULL checks on the kobject and workqueue.
 */
static void bcache_exit(void)
{
	bch_debug_exit();
	bch_writeback_exit();
	bch_request_exit();
	bch_btree_exit();
	if (bcache_kobj)
		kobject_put(bcache_kobj);
	if (bcache_wq)
		destroy_workqueue(bcache_wq);
	unregister_blkdev(bcache_major, "bcache");
	unregister_reboot_notifier(&reboot);
}
1953 | |||
/*
 * Module init: grab a dynamic block major, create the shared workqueue
 * and the /sys/fs/bcache directory with its register files, then bring
 * up the btree/request/writeback/debug subsystems.  Any failure after
 * register_blkdev() unwinds through bcache_exit().
 */
static int __init bcache_init(void)
{
	static const struct attribute *files[] = {
		&ksysfs_register.attr,
		&ksysfs_register_quiet.attr,
		NULL
	};

	mutex_init(&bch_register_lock);
	init_waitqueue_head(&unregister_wait);
	register_reboot_notifier(&reboot);
	closure_debug_init();

	/* 0 = allocate a major number dynamically */
	bcache_major = register_blkdev(0, "bcache");
	if (bcache_major < 0)
		return bcache_major;

	if (!(bcache_wq = create_workqueue("bcache")) ||
	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
	    sysfs_create_files(bcache_kobj, files) ||
	    bch_btree_init() ||
	    bch_request_init() ||
	    bch_writeback_init() ||
	    bch_debug_init(bcache_kobj))
		goto err;

	return 0;
err:
	bcache_exit();
	return -ENOMEM;
}
1985 | |||
/* Declaration order of these macros has no runtime effect */
module_exit(bcache_exit);
module_init(bcache_init);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c new file mode 100644 index 000000000000..4d9cca47e4c6 --- /dev/null +++ b/drivers/md/bcache/sysfs.c | |||
@@ -0,0 +1,817 @@ | |||
1 | /* | ||
2 | * bcache sysfs interfaces | ||
3 | * | ||
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include "bcache.h" | ||
9 | #include "sysfs.h" | ||
10 | #include "btree.h" | ||
11 | #include "request.h" | ||
12 | |||
13 | #include <linux/sort.h> | ||
14 | |||
/* Names for CACHE_REPLACEMENT() values; index order matches the on-disk enum */
static const char * const cache_replacement_policies[] = {
	"lru",
	"fifo",
	"random",
	NULL
};
21 | |||
/* Write-only action triggers */
write_attribute(attach);
write_attribute(detach);
write_attribute(unregister);
write_attribute(stop);
write_attribute(clear_stats);
write_attribute(trigger_gc);
write_attribute(prune_cache);
write_attribute(flash_vol_create);

/* Read-only statistics and geometry */
read_attribute(bucket_size);
read_attribute(block_size);
read_attribute(nbuckets);
read_attribute(tree_depth);
read_attribute(root_usage_percent);
read_attribute(priority_stats);
read_attribute(btree_cache_size);
read_attribute(btree_cache_max_chain);
read_attribute(cache_available_percent);
read_attribute(written);
read_attribute(btree_written);
read_attribute(metadata_written);
read_attribute(active_journal_entries);

/* Latency statistics files (each expands to a group of attributes) */
sysfs_time_stats_attribute(btree_gc, sec, ms);
sysfs_time_stats_attribute(btree_split, sec, us);
sysfs_time_stats_attribute(btree_sort, ms, us);
sysfs_time_stats_attribute(btree_read, ms, us);
sysfs_time_stats_attribute(try_harder, ms, us);

read_attribute(btree_nodes);
read_attribute(btree_used_percent);
read_attribute(average_key_size);
read_attribute(dirty_data);
read_attribute(bset_tree_stats);

read_attribute(state);
read_attribute(cache_read_races);
read_attribute(writeback_keys_done);
read_attribute(writeback_keys_failed);
read_attribute(io_errors);
read_attribute(congested);
rw_attribute(congested_read_threshold_us);
rw_attribute(congested_write_threshold_us);

/* Tunables (read-write) */
rw_attribute(sequential_cutoff);
rw_attribute(sequential_merge);
rw_attribute(data_csum);
rw_attribute(cache_mode);
rw_attribute(writeback_metadata);
rw_attribute(writeback_running);
rw_attribute(writeback_percent);
rw_attribute(writeback_delay);
rw_attribute(writeback_rate);

/* Writeback-rate PD controller tunables */
rw_attribute(writeback_rate_update_seconds);
rw_attribute(writeback_rate_d_term);
rw_attribute(writeback_rate_p_term_inverse);
rw_attribute(writeback_rate_d_smooth);
read_attribute(writeback_rate_debug);

rw_attribute(synchronous);
rw_attribute(journal_delay_ms);
rw_attribute(discard);
rw_attribute(running);
rw_attribute(label);
rw_attribute(readahead);
rw_attribute(io_error_limit);
rw_attribute(io_error_halflife);
rw_attribute(verify);
rw_attribute(key_merging_disabled);
rw_attribute(gc_always_rewrite);
rw_attribute(freelist_percent);
rw_attribute(cache_replacement_policy);
rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
rw_attribute(size);
99 | SHOW(__bch_cached_dev) | ||
100 | { | ||
101 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | ||
102 | disk.kobj); | ||
103 | const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; | ||
104 | |||
105 | #define var(stat) (dc->stat) | ||
106 | |||
107 | if (attr == &sysfs_cache_mode) | ||
108 | return bch_snprint_string_list(buf, PAGE_SIZE, | ||
109 | bch_cache_modes + 1, | ||
110 | BDEV_CACHE_MODE(&dc->sb)); | ||
111 | |||
112 | sysfs_printf(data_csum, "%i", dc->disk.data_csum); | ||
113 | var_printf(verify, "%i"); | ||
114 | var_printf(writeback_metadata, "%i"); | ||
115 | var_printf(writeback_running, "%i"); | ||
116 | var_print(writeback_delay); | ||
117 | var_print(writeback_percent); | ||
118 | sysfs_print(writeback_rate, dc->writeback_rate.rate); | ||
119 | |||
120 | var_print(writeback_rate_update_seconds); | ||
121 | var_print(writeback_rate_d_term); | ||
122 | var_print(writeback_rate_p_term_inverse); | ||
123 | var_print(writeback_rate_d_smooth); | ||
124 | |||
125 | if (attr == &sysfs_writeback_rate_debug) { | ||
126 | char dirty[20]; | ||
127 | char derivative[20]; | ||
128 | char target[20]; | ||
129 | bch_hprint(dirty, | ||
130 | atomic_long_read(&dc->disk.sectors_dirty) << 9); | ||
131 | bch_hprint(derivative, dc->writeback_rate_derivative << 9); | ||
132 | bch_hprint(target, dc->writeback_rate_target << 9); | ||
133 | |||
134 | return sprintf(buf, | ||
135 | "rate:\t\t%u\n" | ||
136 | "change:\t\t%i\n" | ||
137 | "dirty:\t\t%s\n" | ||
138 | "derivative:\t%s\n" | ||
139 | "target:\t\t%s\n", | ||
140 | dc->writeback_rate.rate, | ||
141 | dc->writeback_rate_change, | ||
142 | dirty, derivative, target); | ||
143 | } | ||
144 | |||
145 | sysfs_hprint(dirty_data, | ||
146 | atomic_long_read(&dc->disk.sectors_dirty) << 9); | ||
147 | |||
148 | var_printf(sequential_merge, "%i"); | ||
149 | var_hprint(sequential_cutoff); | ||
150 | var_hprint(readahead); | ||
151 | |||
152 | sysfs_print(running, atomic_read(&dc->running)); | ||
153 | sysfs_print(state, states[BDEV_STATE(&dc->sb)]); | ||
154 | |||
155 | if (attr == &sysfs_label) { | ||
156 | memcpy(buf, dc->sb.label, SB_LABEL_SIZE); | ||
157 | buf[SB_LABEL_SIZE + 1] = '\0'; | ||
158 | strcat(buf, "\n"); | ||
159 | return strlen(buf); | ||
160 | } | ||
161 | |||
162 | #undef var | ||
163 | return 0; | ||
164 | } | ||
165 | SHOW_LOCKED(bch_cached_dev) | ||
166 | |||
167 | STORE(__cached_dev) | ||
168 | { | ||
169 | struct cached_dev *dc = container_of(kobj, struct cached_dev, | ||
170 | disk.kobj); | ||
171 | unsigned v = size; | ||
172 | struct cache_set *c; | ||
173 | |||
174 | #define d_strtoul(var) sysfs_strtoul(var, dc->var) | ||
175 | #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) | ||
176 | |||
177 | sysfs_strtoul(data_csum, dc->disk.data_csum); | ||
178 | d_strtoul(verify); | ||
179 | d_strtoul(writeback_metadata); | ||
180 | d_strtoul(writeback_running); | ||
181 | d_strtoul(writeback_delay); | ||
182 | sysfs_strtoul_clamp(writeback_rate, | ||
183 | dc->writeback_rate.rate, 1, 1000000); | ||
184 | sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); | ||
185 | |||
186 | d_strtoul(writeback_rate_update_seconds); | ||
187 | d_strtoul(writeback_rate_d_term); | ||
188 | d_strtoul(writeback_rate_p_term_inverse); | ||
189 | sysfs_strtoul_clamp(writeback_rate_p_term_inverse, | ||
190 | dc->writeback_rate_p_term_inverse, 1, INT_MAX); | ||
191 | d_strtoul(writeback_rate_d_smooth); | ||
192 | |||
193 | d_strtoul(sequential_merge); | ||
194 | d_strtoi_h(sequential_cutoff); | ||
195 | d_strtoi_h(readahead); | ||
196 | |||
197 | if (attr == &sysfs_clear_stats) | ||
198 | bch_cache_accounting_clear(&dc->accounting); | ||
199 | |||
200 | if (attr == &sysfs_running && | ||
201 | strtoul_or_return(buf)) | ||
202 | bch_cached_dev_run(dc); | ||
203 | |||
204 | if (attr == &sysfs_cache_mode) { | ||
205 | ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); | ||
206 | |||
207 | if (v < 0) | ||
208 | return v; | ||
209 | |||
210 | if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) { | ||
211 | SET_BDEV_CACHE_MODE(&dc->sb, v); | ||
212 | bch_write_bdev_super(dc, NULL); | ||
213 | } | ||
214 | } | ||
215 | |||
216 | if (attr == &sysfs_label) { | ||
217 | memcpy(dc->sb.label, buf, SB_LABEL_SIZE); | ||
218 | bch_write_bdev_super(dc, NULL); | ||
219 | if (dc->disk.c) { | ||
220 | memcpy(dc->disk.c->uuids[dc->disk.id].label, | ||
221 | buf, SB_LABEL_SIZE); | ||
222 | bch_uuid_write(dc->disk.c); | ||
223 | } | ||
224 | } | ||
225 | |||
226 | if (attr == &sysfs_attach) { | ||
227 | if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16) | ||
228 | return -EINVAL; | ||
229 | |||
230 | list_for_each_entry(c, &bch_cache_sets, list) { | ||
231 | v = bch_cached_dev_attach(dc, c); | ||
232 | if (!v) | ||
233 | return size; | ||
234 | } | ||
235 | |||
236 | pr_err("Can't attach %s: cache set not found", buf); | ||
237 | size = v; | ||
238 | } | ||
239 | |||
240 | if (attr == &sysfs_detach && dc->disk.c) | ||
241 | bch_cached_dev_detach(dc); | ||
242 | |||
243 | if (attr == &sysfs_stop) | ||
244 | bcache_device_stop(&dc->disk); | ||
245 | |||
246 | return size; | ||
247 | } | ||
248 | |||
/*
 * Locked store wrapper: serializes with registration via
 * bch_register_lock, then kicks the writeback machinery for the
 * attributes that need a nudge after their value changes.
 */
STORE(bch_cached_dev)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);

	mutex_lock(&bch_register_lock);
	size = __cached_dev_store(kobj, attr, buf, size);

	/*
	 * NOTE(review): size may be a negative errno here, yet the
	 * work below is still queued - confirm that's intended.
	 */
	if (attr == &sysfs_writeback_running)
		bch_writeback_queue(dc);

	if (attr == &sysfs_writeback_percent)
		schedule_delayed_work(&dc->writeback_rate_update,
				      dc->writeback_rate_update_seconds * HZ);

	mutex_unlock(&bch_register_lock);
	return size;
}
267 | |||
/* Attributes exposed under /sys/block/<dev>/bcache for backing devices */
static struct attribute *bch_cached_dev_files[] = {
	&sysfs_attach,
	&sysfs_detach,
	&sysfs_stop,
#if 0
	&sysfs_data_csum,
#endif
	&sysfs_cache_mode,
	&sysfs_writeback_metadata,
	&sysfs_writeback_running,
	&sysfs_writeback_delay,
	&sysfs_writeback_percent,
	&sysfs_writeback_rate,
	&sysfs_writeback_rate_update_seconds,
	&sysfs_writeback_rate_d_term,
	&sysfs_writeback_rate_p_term_inverse,
	&sysfs_writeback_rate_d_smooth,
	&sysfs_writeback_rate_debug,
	&sysfs_dirty_data,
	&sysfs_sequential_cutoff,
	&sysfs_sequential_merge,
	&sysfs_clear_stats,
	&sysfs_running,
	&sysfs_state,
	&sysfs_label,
	&sysfs_readahead,
#ifdef CONFIG_BCACHE_DEBUG
	&sysfs_verify,
#endif
	NULL
};
KTYPE(bch_cached_dev);
300 | |||
301 | SHOW(bch_flash_dev) | ||
302 | { | ||
303 | struct bcache_device *d = container_of(kobj, struct bcache_device, | ||
304 | kobj); | ||
305 | struct uuid_entry *u = &d->c->uuids[d->id]; | ||
306 | |||
307 | sysfs_printf(data_csum, "%i", d->data_csum); | ||
308 | sysfs_hprint(size, u->sectors << 9); | ||
309 | |||
310 | if (attr == &sysfs_label) { | ||
311 | memcpy(buf, u->label, SB_LABEL_SIZE); | ||
312 | buf[SB_LABEL_SIZE + 1] = '\0'; | ||
313 | strcat(buf, "\n"); | ||
314 | return strlen(buf); | ||
315 | } | ||
316 | |||
317 | return 0; | ||
318 | } | ||
319 | |||
/*
 * sysfs store for a flash-only volume (unlocked variant; the
 * STORE_LOCKED wrapper takes bch_register_lock).
 */
STORE(__bch_flash_dev)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	struct uuid_entry *u = &d->c->uuids[d->id];

	sysfs_strtoul(data_csum, d->data_csum);

	if (attr == &sysfs_size) {
		uint64_t v;
		strtoi_h_or_return(buf, v);

		/* Resize: persist the new size, then update the gendisk */
		u->sectors = v >> 9;
		bch_uuid_write(d->c);
		set_capacity(d->disk, u->sectors);
	}

	if (attr == &sysfs_label) {
		memcpy(u->label, buf, SB_LABEL_SIZE);
		bch_uuid_write(d->c);
	}

	if (attr == &sysfs_unregister) {
		/*
		 * NOTE(review): setting `detaching` before stopping
		 * presumably makes teardown discard the volume -
		 * confirm against bcache_device_stop()'s callees.
		 */
		atomic_set(&d->detaching, 1);
		bcache_device_stop(d);
	}

	return size;
}
STORE_LOCKED(bch_flash_dev)
350 | |||
/* Attributes exposed for flash-only volumes */
static struct attribute *bch_flash_dev_files[] = {
	&sysfs_unregister,
#if 0
	&sysfs_data_csum,
#endif
	&sysfs_label,
	&sysfs_size,
	NULL
};
KTYPE(bch_flash_dev);
361 | |||
/*
 * sysfs show for a cache set.  The statistics helpers are GCC nested
 * functions (a GNU C extension); each takes whatever lock it needs.
 */
SHOW(__bch_cache_set)
{
	/* Percent of the root btree node's bytes holding good keys */
	unsigned root_usage(struct cache_set *c)
	{
		unsigned bytes = 0;
		struct bkey *k;
		struct btree *b;
		struct btree_iter iter;

		goto lock_root;

		do {
			rw_unlock(false, b);
lock_root:
			b = c->root;
			rw_lock(false, b, b->level);
			/* loop until we hold the lock on the *current* root */
		} while (b != c->root);

		for_each_key_filter(b, k, &iter, bch_ptr_bad)
			bytes += bkey_bytes(k);

		rw_unlock(false, b);

		return (bytes * 100) / btree_bytes(c);
	}

	/* Total bytes of memory held by the in-memory btree node cache */
	size_t cache_size(struct cache_set *c)
	{
		size_t ret = 0;
		struct btree *b;

		mutex_lock(&c->bucket_lock);
		list_for_each_entry(b, &c->btree_cache, list)
			ret += 1 << (b->page_order + PAGE_SHIFT);

		mutex_unlock(&c->bucket_lock);
		return ret;
	}

	/* Longest chain in the btree node hash table */
	unsigned cache_max_chain(struct cache_set *c)
	{
		unsigned ret = 0;
		struct hlist_head *h;

		mutex_lock(&c->bucket_lock);

		for (h = c->bucket_hash;
		     h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
		     h++) {
			unsigned i = 0;
			struct hlist_node *p;

			hlist_for_each(p, h)
				i++;

			ret = max(ret, i);
		}

		mutex_unlock(&c->bucket_lock);
		return ret;
	}

	/* Percent of btree node space in use, from the last GC's stats */
	unsigned btree_used(struct cache_set *c)
	{
		return div64_u64(c->gc_stats.key_bytes * 100,
				 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
	}

	unsigned average_key_size(struct cache_set *c)
	{
		return c->gc_stats.nkeys
			? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
			: 0;
	}

	struct cache_set *c = container_of(kobj, struct cache_set, kobj);

	sysfs_print(synchronous, CACHE_SYNC(&c->sb));
	sysfs_print(journal_delay_ms, c->journal_delay_ms);
	sysfs_hprint(bucket_size, bucket_bytes(c));
	sysfs_hprint(block_size, block_bytes(c));
	sysfs_print(tree_depth, c->root->level);
	sysfs_print(root_usage_percent, root_usage(c));

	sysfs_hprint(btree_cache_size, cache_size(c));
	sysfs_print(btree_cache_max_chain, cache_max_chain(c));
	sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);

	sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
	sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
	sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us);
	sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
	sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);

	sysfs_print(btree_used_percent, btree_used(c));
	sysfs_print(btree_nodes, c->gc_stats.nodes);
	sysfs_hprint(dirty_data, c->gc_stats.dirty);
	sysfs_hprint(average_key_size, average_key_size(c));

	sysfs_print(cache_read_races,
		    atomic_long_read(&c->cache_read_races));

	sysfs_print(writeback_keys_done,
		    atomic_long_read(&c->writeback_keys_done));
	sysfs_print(writeback_keys_failed,
		    atomic_long_read(&c->writeback_keys_failed));

	/* See count_io_errors for why 88 */
	sysfs_print(io_error_halflife, c->error_decay * 88);
	sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);

	/* << 9: sectors to bytes */
	sysfs_hprint(congested,
		     ((uint64_t) bch_get_congested(c)) << 9);
	sysfs_print(congested_read_threshold_us,
		    c->congested_read_threshold_us);
	sysfs_print(congested_write_threshold_us,
		    c->congested_write_threshold_us);

	sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
	sysfs_printf(verify, "%i", c->verify);
	sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
	sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
	sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
	sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);

	if (attr == &sysfs_bset_tree_stats)
		return bch_bset_print_stats(c, buf);

	return 0;
}
SHOW_LOCKED(bch_cache_set)
493 | |||
/*
 * sysfs store for a cache set (unlocked variant; the STORE_LOCKED
 * wrapper takes bch_register_lock).
 */
STORE(__bch_cache_set)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);

	if (attr == &sysfs_unregister)
		bch_cache_set_unregister(c);

	if (attr == &sysfs_stop)
		bch_cache_set_stop(c);

	if (attr == &sysfs_synchronous) {
		bool sync = strtoul_or_return(buf);

		/* Only rewrite the superblock if the flag actually changed */
		if (sync != CACHE_SYNC(&c->sb)) {
			SET_CACHE_SYNC(&c->sb, sync);
			bcache_write_super(c);
		}
	}

	if (attr == &sysfs_flash_vol_create) {
		int r;
		uint64_t v;
		strtoi_h_or_return(buf, v);

		r = bch_flash_dev_create(c, v);
		if (r)
			return r;
	}

	if (attr == &sysfs_clear_stats) {
		atomic_long_set(&c->writeback_keys_done, 0);
		atomic_long_set(&c->writeback_keys_failed, 0);

		memset(&c->gc_stats, 0, sizeof(struct gc_stat));
		bch_cache_accounting_clear(&c->accounting);
	}

	if (attr == &sysfs_trigger_gc)
		bch_queue_gc(c);

	if (attr == &sysfs_prune_cache) {
		/* Invoke the btree cache shrinker by hand */
		struct shrink_control sc;
		sc.gfp_mask = GFP_KERNEL;
		sc.nr_to_scan = strtoul_or_return(buf);
		c->shrink.shrink(&c->shrink, &sc);
	}

	sysfs_strtoul(congested_read_threshold_us,
		      c->congested_read_threshold_us);
	sysfs_strtoul(congested_write_threshold_us,
		      c->congested_write_threshold_us);

	if (attr == &sysfs_io_error_limit)
		c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;

	/* See count_io_errors() for why 88 */
	if (attr == &sysfs_io_error_halflife)
		c->error_decay = strtoul_or_return(buf) / 88;

	sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
	sysfs_strtoul(verify, c->verify);
	sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
	sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
	sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
	sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);

	return size;
}
STORE_LOCKED(bch_cache_set)
563 | |||
/*
 * The "internal" kobject (a child directory of the cache set's sysfs
 * dir) reuses the cache set's show/store handlers, translating the
 * embedded-kobject pointer back to the main one.
 */
SHOW(bch_cache_set_internal)
{
	struct cache_set *c = container_of(kobj, struct cache_set, internal);
	return bch_cache_set_show(&c->kobj, attr, buf);
}

STORE(bch_cache_set_internal)
{
	struct cache_set *c = container_of(kobj, struct cache_set, internal);
	return bch_cache_set_store(&c->kobj, attr, buf, size);
}

/* No-op: `internal` is embedded in struct cache_set, freed with it */
static void bch_cache_set_internal_release(struct kobject *k)
{
}
579 | |||
/* User-facing attributes of a cache set's top-level sysfs directory */
static struct attribute *bch_cache_set_files[] = {
	&sysfs_unregister,
	&sysfs_stop,
	&sysfs_synchronous,
	&sysfs_journal_delay_ms,
	&sysfs_flash_vol_create,

	&sysfs_bucket_size,
	&sysfs_block_size,
	&sysfs_tree_depth,
	&sysfs_root_usage_percent,
	&sysfs_btree_cache_size,
	&sysfs_cache_available_percent,

	&sysfs_average_key_size,
	&sysfs_dirty_data,

	&sysfs_io_error_limit,
	&sysfs_io_error_halflife,
	&sysfs_congested,
	&sysfs_congested_read_threshold_us,
	&sysfs_congested_write_threshold_us,
	&sysfs_clear_stats,
	NULL
};
KTYPE(bch_cache_set);
606 | |||
/* Debug/tuning attributes, kept in the internal/ subdirectory */
static struct attribute *bch_cache_set_internal_files[] = {
	&sysfs_active_journal_entries,

	sysfs_time_stats_attribute_list(btree_gc, sec, ms)
	sysfs_time_stats_attribute_list(btree_split, sec, us)
	sysfs_time_stats_attribute_list(btree_sort, ms, us)
	sysfs_time_stats_attribute_list(btree_read, ms, us)
	sysfs_time_stats_attribute_list(try_harder, ms, us)

	&sysfs_btree_nodes,
	&sysfs_btree_used_percent,
	&sysfs_btree_cache_max_chain,

	&sysfs_bset_tree_stats,
	&sysfs_cache_read_races,
	&sysfs_writeback_keys_done,
	&sysfs_writeback_keys_failed,

	&sysfs_trigger_gc,
	&sysfs_prune_cache,
#ifdef CONFIG_BCACHE_DEBUG
	&sysfs_verify,
	&sysfs_key_merging_disabled,
#endif
	&sysfs_gc_always_rewrite,
	&sysfs_btree_shrinker_disabled,
	&sysfs_copy_gc_enabled,
	NULL
};
KTYPE(bch_cache_set_internal);
637 | |||
638 | SHOW(__bch_cache) | ||
639 | { | ||
640 | struct cache *ca = container_of(kobj, struct cache, kobj); | ||
641 | |||
642 | sysfs_hprint(bucket_size, bucket_bytes(ca)); | ||
643 | sysfs_hprint(block_size, block_bytes(ca)); | ||
644 | sysfs_print(nbuckets, ca->sb.nbuckets); | ||
645 | sysfs_print(discard, ca->discard); | ||
646 | sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); | ||
647 | sysfs_hprint(btree_written, | ||
648 | atomic_long_read(&ca->btree_sectors_written) << 9); | ||
649 | sysfs_hprint(metadata_written, | ||
650 | (atomic_long_read(&ca->meta_sectors_written) + | ||
651 | atomic_long_read(&ca->btree_sectors_written)) << 9); | ||
652 | |||
653 | sysfs_print(io_errors, | ||
654 | atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); | ||
655 | |||
656 | sysfs_print(freelist_percent, ca->free.size * 100 / | ||
657 | ((size_t) ca->sb.nbuckets)); | ||
658 | |||
659 | if (attr == &sysfs_cache_replacement_policy) | ||
660 | return bch_snprint_string_list(buf, PAGE_SIZE, | ||
661 | cache_replacement_policies, | ||
662 | CACHE_REPLACEMENT(&ca->sb)); | ||
663 | |||
664 | if (attr == &sysfs_priority_stats) { | ||
665 | int cmp(const void *l, const void *r) | ||
666 | { return *((uint16_t *) r) - *((uint16_t *) l); } | ||
667 | |||
668 | /* Number of quantiles we compute */ | ||
669 | const unsigned nq = 31; | ||
670 | |||
671 | size_t n = ca->sb.nbuckets, i, unused, btree; | ||
672 | uint64_t sum = 0; | ||
673 | uint16_t q[nq], *p, *cached; | ||
674 | ssize_t ret; | ||
675 | |||
676 | cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); | ||
677 | if (!p) | ||
678 | return -ENOMEM; | ||
679 | |||
680 | mutex_lock(&ca->set->bucket_lock); | ||
681 | for (i = ca->sb.first_bucket; i < n; i++) | ||
682 | p[i] = ca->buckets[i].prio; | ||
683 | mutex_unlock(&ca->set->bucket_lock); | ||
684 | |||
685 | sort(p, n, sizeof(uint16_t), cmp, NULL); | ||
686 | |||
687 | while (n && | ||
688 | !cached[n - 1]) | ||
689 | --n; | ||
690 | |||
691 | unused = ca->sb.nbuckets - n; | ||
692 | |||
693 | while (cached < p + n && | ||
694 | *cached == BTREE_PRIO) | ||
695 | cached++; | ||
696 | |||
697 | btree = cached - p; | ||
698 | n -= btree; | ||
699 | |||
700 | for (i = 0; i < n; i++) | ||
701 | sum += INITIAL_PRIO - cached[i]; | ||
702 | |||
703 | if (n) | ||
704 | do_div(sum, n); | ||
705 | |||
706 | for (i = 0; i < nq; i++) | ||
707 | q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; | ||
708 | |||
709 | vfree(p); | ||
710 | |||
711 | ret = snprintf(buf, PAGE_SIZE, | ||
712 | "Unused: %zu%%\n" | ||
713 | "Metadata: %zu%%\n" | ||
714 | "Average: %llu\n" | ||
715 | "Sectors per Q: %zu\n" | ||
716 | "Quantiles: [", | ||
717 | unused * 100 / (size_t) ca->sb.nbuckets, | ||
718 | btree * 100 / (size_t) ca->sb.nbuckets, sum, | ||
719 | n * ca->sb.bucket_size / (nq + 1)); | ||
720 | |||
721 | for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) | ||
722 | ret += snprintf(buf + ret, PAGE_SIZE - ret, | ||
723 | i < nq - 1 ? "%u " : "%u]\n", q[i]); | ||
724 | |||
725 | buf[PAGE_SIZE - 1] = '\0'; | ||
726 | return ret; | ||
727 | } | ||
728 | |||
729 | return 0; | ||
730 | } | ||
731 | SHOW_LOCKED(bch_cache) | ||
732 | |||
/*
 * Store callback for per-cache-device attributes; runs with
 * bch_register_lock held (see STORE_LOCKED below).
 */
STORE(__bch_cache)
{
	struct cache *ca = container_of(kobj, struct cache, kobj);

	if (attr == &sysfs_discard) {
		bool v = strtoul_or_return(buf);

		/* Only honour the runtime flag if the device can discard */
		if (blk_queue_discard(bdev_get_queue(ca->bdev)))
			ca->discard = v;

		/* but always persist the requested setting on disk */
		if (v != CACHE_DISCARD(&ca->sb)) {
			SET_CACHE_DISCARD(&ca->sb, v);
			bcache_write_super(ca->set);
		}
	}

	if (attr == &sysfs_cache_replacement_policy) {
		ssize_t v = bch_read_string_list(buf, cache_replacement_policies);

		if (v < 0)
			return v;

		if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) {
			mutex_lock(&ca->set->bucket_lock);
			SET_CACHE_REPLACEMENT(&ca->sb, v);
			mutex_unlock(&ca->set->bucket_lock);

			bcache_write_super(ca->set);
		}
	}

	if (attr == &sysfs_freelist_percent) {
		DECLARE_FIFO(long, free);
		long i;
		size_t p = strtoul_or_return(buf);

		/*
		 * Convert the percentage to a bucket count, clamped to
		 * roughly [nbuckets/512, nbuckets/2]
		 */
		p = clamp_t(size_t,
			    ((size_t) ca->sb.nbuckets * p) / 100,
			    roundup_pow_of_two(ca->sb.nbuckets) >> 9,
			    ca->sb.nbuckets / 2);

		if (!init_fifo_exact(&free, p, GFP_KERNEL))
			return -ENOMEM;

		mutex_lock(&ca->set->bucket_lock);

		/*
		 * Fill the new fifo from the old one, then install it;
		 * after the swap the local fifo holds whatever buckets
		 * didn't fit in the new (possibly smaller) freelist.
		 */
		fifo_move(&free, &ca->free);
		fifo_swap(&free, &ca->free);

		mutex_unlock(&ca->set->bucket_lock);

		/* unpin the buckets that no longer fit on the freelist */
		while (fifo_pop(&free, i))
			atomic_dec(&ca->buckets[i].pin);

		free_fifo(&free);
	}

	if (attr == &sysfs_clear_stats) {
		atomic_long_set(&ca->sectors_written, 0);
		atomic_long_set(&ca->btree_sectors_written, 0);
		atomic_long_set(&ca->meta_sectors_written, 0);
		atomic_set(&ca->io_count, 0);
		atomic_set(&ca->io_errors, 0);
	}

	return size;
}
STORE_LOCKED(bch_cache)
801 | |||
/* Attributes exposed in each cache device's bcache/ sysfs directory */
static struct attribute *bch_cache_files[] = {
	&sysfs_bucket_size,
	&sysfs_block_size,
	&sysfs_nbuckets,
	&sysfs_priority_stats,
	&sysfs_discard,
	&sysfs_written,
	&sysfs_btree_written,
	&sysfs_metadata_written,
	&sysfs_io_errors,
	&sysfs_clear_stats,
	&sysfs_freelist_percent,
	&sysfs_cache_replacement_policy,
	NULL
};
KTYPE(bch_cache);
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h new file mode 100644 index 000000000000..0526fe92a683 --- /dev/null +++ b/drivers/md/bcache/sysfs.h | |||
@@ -0,0 +1,110 @@ | |||
#ifndef _BCACHE_SYSFS_H_
#define _BCACHE_SYSFS_H_

/*
 * Boilerplate for bcache's sysfs interface.
 *
 * SHOW()/STORE() declare a kobject's show/store callbacks; the
 * sysfs_* helpers below are used inside those callbacks and return
 * early from the enclosing callback when @attr matches the named
 * attribute, so each callback reads as a flat list of handlers.
 */

/* Define <type>_ktype wired to <type>_show/_store/_release/_files */
#define KTYPE(type)							\
struct kobj_type type ## _ktype = {					\
	.release	= type ## _release,				\
	.sysfs_ops	= &((const struct sysfs_ops) {			\
		.show	= type ## _show,				\
		.store	= type ## _store				\
	}),								\
	.default_attrs	= type ## _files				\
}

/* Declare the show callback for @fn */
#define SHOW(fn)							\
static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
			   char *buf)					\

/* Declare the store callback for @fn */
#define STORE(fn)							\
static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
			    const char *buf, size_t size)		\

/* Define fn_show() as __fn_show() run under bch_register_lock */
#define SHOW_LOCKED(fn)							\
SHOW(fn)								\
{									\
	ssize_t ret;							\
	mutex_lock(&bch_register_lock);					\
	ret = __ ## fn ## _show(kobj, attr, buf);			\
	mutex_unlock(&bch_register_lock);				\
	return ret;							\
}

/* Define fn_store() as __fn_store() run under bch_register_lock */
#define STORE_LOCKED(fn)						\
STORE(fn)								\
{									\
	ssize_t ret;							\
	mutex_lock(&bch_register_lock);					\
	ret = __ ## fn ## _store(kobj, attr, buf, size);		\
	mutex_unlock(&bch_register_lock);				\
	return ret;							\
}

/* Declare a struct attribute named sysfs_<_name> with mode @_mode */
#define __sysfs_attribute(_name, _mode)					\
	static struct attribute sysfs_##_name =				\
		{ .name = #_name, .mode = _mode }

#define write_attribute(n)	__sysfs_attribute(n, S_IWUSR)
#define read_attribute(n)	__sysfs_attribute(n, S_IRUGO)
#define rw_attribute(n)		__sysfs_attribute(n, S_IRUGO|S_IWUSR)

/* In a show callback: emit @fmt for attribute @file and return */
#define sysfs_printf(file, fmt, ...)					\
do {									\
	if (attr == &sysfs_ ## file)					\
		return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);	\
} while (0)

/* In a show callback: emit @var via the type-generic snprint() */
#define sysfs_print(file, var)						\
do {									\
	if (attr == &sysfs_ ## file)					\
		return snprint(buf, PAGE_SIZE, var);			\
} while (0)

/* In a show callback: emit @val human-readable (k/M/G... suffix) */
#define sysfs_hprint(file, val)						\
do {									\
	if (attr == &sysfs_ ## file) {					\
		ssize_t ret = bch_hprint(buf, val);			\
		strcat(buf, "\n");					\
		return ret + 1;						\
	}								\
} while (0)

/* Shorthands when the attribute name matches a field of var() */
#define var_printf(_var, fmt)	sysfs_printf(_var, fmt, var(_var))
#define var_print(_var)		sysfs_print(_var, var(_var))
#define var_hprint(_var)	sysfs_hprint(_var, var(_var))

/* In a store callback: parse @buf into @var and return */
#define sysfs_strtoul(file, var)					\
do {									\
	if (attr == &sysfs_ ## file)					\
		return strtoul_safe(buf, var) ?: (ssize_t) size;	\
} while (0)

/* As sysfs_strtoul(), clamping the parsed value to [min, max] */
#define sysfs_strtoul_clamp(file, var, min, max)			\
do {									\
	if (attr == &sysfs_ ## file)					\
		return strtoul_safe_clamp(buf, var, min, max)		\
			?: (ssize_t) size;				\
} while (0)

/*
 * Parse @cp as a decimal; evaluates to the value, but returns the
 * error from the *calling* function on failure (statement expression).
 */
#define strtoul_or_return(cp)						\
({									\
	unsigned long _v;						\
	int _r = kstrtoul(cp, 10, &_v);					\
	if (_r)								\
		return _r;						\
	_v;								\
})

/* Parse @cp (with unit suffix) into @v; returns from the caller on error */
#define strtoi_h_or_return(cp, v)					\
do {									\
	int _r = strtoi_h(cp, &v);					\
	if (_r)								\
		return _r;						\
} while (0)

/* In a store callback: parse @buf (with unit suffix) into @var */
#define sysfs_hatoi(file, var)						\
do {									\
	if (attr == &sysfs_ ## file)					\
		return strtoi_h(buf, &var) ?: (ssize_t) size;		\
} while (0)

#endif /* _BCACHE_SYSFS_H_ */
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c new file mode 100644 index 000000000000..983f9bb411bc --- /dev/null +++ b/drivers/md/bcache/trace.c | |||
@@ -0,0 +1,26 @@ | |||
#include "bcache.h"
#include "btree.h"
#include "request.h"

#include <linux/module.h>

/*
 * Instantiate the bcache tracepoints: CREATE_TRACE_POINTS must be
 * defined in exactly one translation unit before including the trace
 * header, which makes that include generate the tracepoint bodies.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/bcache.h>

/* Export the tracepoints so modular call sites can fire them */
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c new file mode 100644 index 000000000000..da3a99e85b1e --- /dev/null +++ b/drivers/md/bcache/util.c | |||
@@ -0,0 +1,377 @@ | |||
1 | /* | ||
 * random utility code, for bcache but in theory not specific to bcache
3 | * | ||
4 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
5 | * Copyright 2012 Google, Inc. | ||
6 | */ | ||
7 | |||
8 | #include <linux/bio.h> | ||
9 | #include <linux/blkdev.h> | ||
10 | #include <linux/ctype.h> | ||
11 | #include <linux/debugfs.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/seq_file.h> | ||
14 | #include <linux/types.h> | ||
15 | |||
16 | #include "util.h" | ||
17 | |||
/* The kernel only provides simple_strtol/simple_strtoul; alias the rest */
#define simple_strtoint(c, end, base)	simple_strtol(c, end, base)
#define simple_strtouint(c, end, base)	simple_strtoul(c, end, base)

/*
 * STRTO_H - define bch_<name>_h(), which parses a decimal integer with
 * an optional k/m/g/t/p/e/z/y binary-unit suffix and an optional
 * trailing newline; returns 0 on success, -EINVAL on bad input or
 * overflow. The switch cases deliberately fall through so that u ends
 * up counting how many times to multiply by 1024 ('k' = 1, 'y' = 8).
 * In the multiply loop, the first check handles unsigned result types
 * ((type) ~0 > 0 selects them) and the second handles signed ones.
 */
#define STRTO_H(name, type)					\
int bch_ ## name ## _h(const char *cp, type *res)		\
{								\
	int u = 0;						\
	char *e;						\
	type i = simple_ ## name(cp, &e, 10);			\
								\
	switch (tolower(*e)) {					\
	default:						\
		return -EINVAL;					\
	case 'y':						\
	case 'z':						\
		u++;						\
	case 'e':						\
		u++;						\
	case 'p':						\
		u++;						\
	case 't':						\
		u++;						\
	case 'g':						\
		u++;						\
	case 'm':						\
		u++;						\
	case 'k':						\
		u++;						\
		if (e++ == cp)					\
			return -EINVAL;				\
	case '\n':						\
	case '\0':						\
		if (*e == '\n')					\
			e++;					\
	}							\
								\
	if (*e)							\
		return -EINVAL;					\
								\
	while (u--) {						\
		if ((type) ~0 > 0 &&				\
		    (type) ~0 / 1024 <= i)			\
			return -EINVAL;				\
		if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) ||	\
		    (i < 0 && -ANYSINT_MAX(type) / 1024 > i))	\
			return -EINVAL;				\
		i *= 1024;					\
	}							\
								\
	*res = i;						\
	return 0;						\
}								\

STRTO_H(strtoint, int)
STRTO_H(strtouint, unsigned int)
STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
75 | |||
/*
 * Format @v into @buf as a human-readable size: plain decimal below
 * 1024, otherwise a scaled value with one decimal digit and a
 * k/M/G/T/P/E/Z/Y suffix (powers of 1024). Returns the number of
 * characters written, excluding the terminating NUL.
 */
ssize_t bch_hprint(char *buf, int64_t v)
{
	static const char units[] = "?kMGTPEZY";
	char dec[4] = "";
	int u, t = 0;

	for (u = 0; v >= 1024 || v <= -1024; u++) {
		/* was v & ~(~0 << 10): left-shifting ~0 (negative) is UB */
		t = v & 1023;
		v >>= 10;
	}

	if (!u)
		/* was "%llu": v is signed, negatives printed as huge values */
		return sprintf(buf, "%lli", (long long) v);

	if (v < 100 && v > -100)
		snprintf(dec, sizeof(dec), ".%i", t / 100);

	return sprintf(buf, "%lli%s%c", (long long) v, dec, units[u]);
}
95 | |||
/*
 * Print the entries of the NULL-terminated @list into @buf, separated
 * by spaces, with entry @selected wrapped in brackets; the output is
 * terminated by '\n'. Returns the number of bytes written.
 */
ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
				size_t selected)
{
	char *out = buf;
	size_t i;

	for (i = 0; list[i]; i++)
		out += snprintf(out, buf + size - out,
				i == selected ? "[%s] " : "%s ", list[i]);

	/* empty list: the old code wrote buf[-1], underflowing the buffer */
	if (out == buf)
		return 0;

	out[-1] = '\n';
	return out - buf;
}
109 | |||
110 | ssize_t bch_read_string_list(const char *buf, const char * const list[]) | ||
111 | { | ||
112 | size_t i; | ||
113 | char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); | ||
114 | if (!d) | ||
115 | return -ENOMEM; | ||
116 | |||
117 | s = strim(d); | ||
118 | |||
119 | for (i = 0; list[i]; i++) | ||
120 | if (!strcmp(list[i], s)) | ||
121 | break; | ||
122 | |||
123 | kfree(d); | ||
124 | |||
125 | if (!list[i]) | ||
126 | return -EINVAL; | ||
127 | |||
128 | return i; | ||
129 | } | ||
130 | |||
/* Return true iff the @n bytes at @p are all zero */
bool bch_is_zero(const char *p, size_t n)
{
	const char *end = p + n;

	while (p < end)
		if (*p++)
			return false;

	return true;
}
140 | |||
/*
 * Parse the hex digits of a UUID string @s (separators '-' and ':'
 * are skipped) into the 16-byte buffer @uuid. Stops at the first
 * character outside the accepted set or after 32 hex digits; returns
 * the number of input characters consumed.
 */
int bch_parse_uuid(const char *s, char *uuid)
{
	size_t i, j, x;

	memset(uuid, 0, 16);

	j = 0;
	for (i = 0;
	     i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32;
	     i++) {
		x = s[i] | 32;	/* lowercase letters; digits unchanged */

		if (x >= '0' && x <= '9')
			x -= '0';
		else if (x >= 'a' && x <= 'f')
			x -= 'a' - 10;
		else
			continue;	/* skip '-' / ':' separators */

		/* even digits are the high nibble of their byte */
		if (!(j & 1))
			x <<= 4;
		uuid[j++ >> 1] |= x;
	}
	return i;
}
168 | |||
/*
 * Fold one sample into @stats: tracks the maximum duration and 8-bit
 * fixed-point EWMAs of the call duration and the call-to-call
 * interval. Times are in local_clock() units; deltas are clamped to 0
 * if the clock appears to have gone backwards.
 */
void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
{
	uint64_t now = local_clock();
	uint64_t duration = time_after64(now, start_time)
		? now - start_time : 0;
	uint64_t last = time_after64(now, stats->last)
		? now - stats->last : 0;

	stats->max_duration = max(stats->max_duration, duration);

	if (stats->last) {
		ewma_add(stats->average_duration, duration, 8, 8);

		if (stats->average_frequency)
			ewma_add(stats->average_frequency, last, 8, 8);
		else
			/* first interval sample: seed the EWMA (<< 8 fixed point) */
			stats->average_frequency = last << 8;
	} else {
		/* first call ever: seed the duration EWMA */
		stats->average_duration = duration << 8;
	}

	/* 0 means "never called", so never store 0 as a timestamp */
	stats->last = now ?: 1;
}
192 | |||
/*
 * Rate limiting: advance d->next by the time that @done units of work
 * "cost" at d->rate (presumably units per nanosecond — confirm against
 * callers), and return how many jiffies the caller should wait before
 * continuing (0 if d->next is already in the past).
 */
unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
{
	uint64_t now = local_clock();

	d->next += div_u64(done, d->rate);

	return time_after64(d->next, now)
		? div_u64(d->next - now, NSEC_PER_SEC / HZ)
		: 0;
}
203 | |||
/*
 * Populate @bio's bvec array to cover bio->bi_size bytes starting at
 * @base (which may be a vmalloc address); if @base is NULL only the
 * offsets and lengths are filled in and the caller supplies pages
 * later. The bio must be freshly initialized: bi_size set, no vecs.
 *
 * The goto jumps into the loop so that the first bvec keeps the
 * offset of @base within its page; subsequent bvecs start at offset 0.
 */
void bch_bio_map(struct bio *bio, void *base)
{
	size_t size = bio->bi_size;
	struct bio_vec *bv = bio->bi_io_vec;

	BUG_ON(!bio->bi_size);
	BUG_ON(bio->bi_vcnt);

	bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
	goto start;

	for (; size; bio->bi_vcnt++, bv++) {
		bv->bv_offset = 0;
start:		bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
				   size);
		if (base) {
			/* vmalloc memory needs a page-table walk to get the page */
			bv->bv_page = is_vmalloc_addr(base)
				? vmalloc_to_page(base)
				: virt_to_page(base);

			base += bv->bv_len;
		}

		size -= bv->bv_len;
	}
}
230 | |||
/*
 * Allocate a page for every bvec in @bio (from bi_idx onwards).
 * Returns 0 on success; on failure frees the pages allocated so far
 * and returns -ENOMEM.
 */
int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
{
	int i;
	struct bio_vec *bv;

	bio_for_each_segment(bv, bio, i) {
		bv->bv_page = alloc_page(gfp);
		if (!bv->bv_page) {
			/* unwind: free the pages already allocated */
			while (bv-- != bio->bi_io_vec + bio->bi_idx)
				__free_page(bv->bv_page);
			return -ENOMEM;
		}
	}

	return 0;
}
247 | |||
248 | /* | ||
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (any
 * use permitted, subject to the terms of the PostgreSQL license)
251 | |||
252 | * If we have a 64-bit integer type, then a 64-bit CRC looks just like the | ||
253 | * usual sort of implementation. (See Ross Williams' excellent introduction | ||
254 | * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from | ||
255 | * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) | ||
256 | * If we have no working 64-bit type, then fake it with two 32-bit registers. | ||
257 | * | ||
258 | * The present implementation is a normal (not "reflected", in Williams' | ||
259 | * terms) 64-bit CRC, using initial all-ones register contents and a final | ||
260 | * bit inversion. The chosen polynomial is borrowed from the DLT1 spec | ||
261 | * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): | ||
262 | * | ||
263 | * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + | ||
264 | * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + | ||
265 | * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + | ||
266 | * x^7 + x^4 + x + 1 | ||
267 | */ | ||
268 | |||
/*
 * Lookup table for the normal (non-reflected) ECMA-182 64-bit CRC
 * polynomial described above, indexed by the top byte of the running
 * CRC xor'd with the next input byte.
 */
static const uint64_t crc_table[256] = {
	0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
	0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
	0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
	0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
	0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
	0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
	0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
	0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
	0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
	0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
	0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
	0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
	0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
	0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
	0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
	0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
	0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
	0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
	0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
	0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
	0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
	0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
	0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
	0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
	0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
	0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
	0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
	0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
	0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
	0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
	0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
	0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
	0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
	0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
	0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
	0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
	0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
	0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
	0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
	0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
	0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
	0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
	0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
	0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
	0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
	0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
	0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
	0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
	0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
	0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
	0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
	0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
	0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
	0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
	0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
	0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
	0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
	0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
	0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
	0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
	0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
	0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
	0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
	0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
	0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
	0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
	0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
	0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
	0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
	0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
	0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
	0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
	0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
	0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
	0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
	0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
	0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
	0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
	0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
	0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
	0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
	0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
	0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
	0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
	0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
	0x9AFCE626CE85B507ULL,
};
357 | |||
358 | uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len) | ||
359 | { | ||
360 | const unsigned char *data = _data; | ||
361 | |||
362 | while (len--) { | ||
363 | int i = ((int) (crc >> 56) ^ *data++) & 0xFF; | ||
364 | crc = crc_table[i] ^ (crc << 8); | ||
365 | } | ||
366 | |||
367 | return crc; | ||
368 | } | ||
369 | |||
/* One-shot CRC64: all-ones initial value and final bit inversion */
uint64_t bch_crc64(const void *data, size_t len)
{
	const uint64_t init = 0xffffffffffffffffULL;

	return bch_crc64_update(init, data, len) ^ init;
}
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h new file mode 100644 index 000000000000..577393e38c3a --- /dev/null +++ b/drivers/md/bcache/util.h | |||
@@ -0,0 +1,589 @@ | |||
1 | |||
2 | #ifndef _BCACHE_UTIL_H | ||
3 | #define _BCACHE_UTIL_H | ||
4 | |||
5 | #include <linux/errno.h> | ||
6 | #include <linux/kernel.h> | ||
7 | #include <linux/llist.h> | ||
8 | #include <linux/ratelimit.h> | ||
9 | #include <linux/vmalloc.h> | ||
10 | #include <linux/workqueue.h> | ||
11 | |||
12 | #include "closure.h" | ||
13 | |||
14 | #define PAGE_SECTORS (PAGE_SIZE / 512) | ||
15 | |||
16 | struct closure; | ||
17 | |||
18 | #include <trace/events/bcache.h> | ||
19 | |||
#ifdef CONFIG_BCACHE_EDEBUG

/* Extended debugging: trap underflow/overflow of refcount-style atomics */
#define atomic_dec_bug(v)	BUG_ON(atomic_dec_return(v) < 0)
#define atomic_inc_bug(v, i)	BUG_ON(atomic_inc_return(v) <= i)

#else /* EDEBUG */

#define atomic_dec_bug(v)	atomic_dec(v)
#define atomic_inc_bug(v, i)	atomic_inc(v)

#endif
31 | |||
/*
 * BITMASK - define name()/SET_name() accessors for a @size-bit field
 * stored at bit @offset within member @field of struct @type.
 * NOTE(review): SET_##name() does not mask @v to @size bits, so an
 * out-of-range value corrupts neighbouring fields; it also assumes
 * size < 64 (shifting a 64-bit value by 64 would be undefined).
 */
#define BITMASK(name, type, field, offset, size)		\
static inline uint64_t name(const type *k)			\
{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); }	\
								\
static inline void SET_##name(type *k, uint64_t v)		\
{								\
	k->field &= ~(~((uint64_t) ~0 << size) << offset);	\
	k->field |= v << offset;				\
}
41 | |||
/* A simple binary heap with fixed capacity and out-of-line storage */
#define DECLARE_HEAP(type, name)					\
	struct {							\
		size_t size, used;					\
		type *data;						\
	} name

/*
 * Allocate backing storage for @_size elements: kmalloc for small
 * allocations, falling back to vmalloc for large ones when the gfp
 * flags allow sleeping. Evaluates to the data pointer (NULL on
 * failure).
 */
#define init_heap(heap, _size, gfp)					\
({									\
	size_t _bytes;							\
	(heap)->used = 0;						\
	(heap)->size = (_size);						\
	_bytes = (heap)->size * sizeof(*(heap)->data);			\
	(heap)->data = NULL;						\
	if (_bytes < KMALLOC_MAX_SIZE)					\
		(heap)->data = kmalloc(_bytes, (gfp));			\
	if ((!(heap)->data) && ((gfp) & GFP_KERNEL))			\
		(heap)->data = vmalloc(_bytes);				\
	(heap)->data;							\
})

/* Free storage allocated by init_heap() (handles both allocators) */
#define free_heap(heap)							\
do {									\
	if (is_vmalloc_addr((heap)->data))				\
		vfree((heap)->data);					\
	else								\
		kfree((heap)->data);					\
	(heap)->data = NULL;						\
} while (0)
70 | |||
71 | #define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) | ||
72 | |||
73 | #define heap_sift(h, i, cmp) \ | ||
74 | do { \ | ||
75 | size_t _r, _j = i; \ | ||
76 | \ | ||
77 | for (; _j * 2 + 1 < (h)->used; _j = _r) { \ | ||
78 | _r = _j * 2 + 1; \ | ||
79 | if (_r + 1 < (h)->used && \ | ||
80 | cmp((h)->data[_r], (h)->data[_r + 1])) \ | ||
81 | _r++; \ | ||
82 | \ | ||
83 | if (cmp((h)->data[_r], (h)->data[_j])) \ | ||
84 | break; \ | ||
85 | heap_swap(h, _r, _j); \ | ||
86 | } \ | ||
87 | } while (0) | ||
88 | |||
89 | #define heap_sift_down(h, i, cmp) \ | ||
90 | do { \ | ||
91 | while (i) { \ | ||
92 | size_t p = (i - 1) / 2; \ | ||
93 | if (cmp((h)->data[i], (h)->data[p])) \ | ||
94 | break; \ | ||
95 | heap_swap(h, i, p); \ | ||
96 | i = p; \ | ||
97 | } \ | ||
98 | } while (0) | ||
99 | |||
100 | #define heap_add(h, d, cmp) \ | ||
101 | ({ \ | ||
102 | bool _r = !heap_full(h); \ | ||
103 | if (_r) { \ | ||
104 | size_t _i = (h)->used++; \ | ||
105 | (h)->data[_i] = d; \ | ||
106 | \ | ||
107 | heap_sift_down(h, _i, cmp); \ | ||
108 | heap_sift(h, _i, cmp); \ | ||
109 | } \ | ||
110 | _r; \ | ||
111 | }) | ||
112 | |||
113 | #define heap_pop(h, d, cmp) \ | ||
114 | ({ \ | ||
115 | bool _r = (h)->used; \ | ||
116 | if (_r) { \ | ||
117 | (d) = (h)->data[0]; \ | ||
118 | (h)->used--; \ | ||
119 | heap_swap(h, 0, (h)->used); \ | ||
120 | heap_sift(h, 0, cmp); \ | ||
121 | } \ | ||
122 | _r; \ | ||
123 | }) | ||
124 | |||
125 | #define heap_peek(h) ((h)->size ? (h)->data[0] : NULL) | ||
126 | |||
127 | #define heap_full(h) ((h)->used == (h)->size) | ||
128 | |||
/*
 * A power-of-two ring buffer. ->mask is allocated_size - 1; one slot
 * is left empty so that front == back unambiguously means "empty".
 */
#define DECLARE_FIFO(type, name)					\
	struct {							\
		size_t front, back, size, mask;				\
		type *data;						\
	} name

/* Iterate entries front-to-back, copying each into @c */
#define fifo_for_each(c, fifo, iter)					\
	for (iter = (fifo)->front;					\
	     c = (fifo)->data[iter], iter != (fifo)->back;		\
	     iter = (iter + 1) & (fifo)->mask)

/*
 * Allocate ring storage for ->size elements plus the sentinel slot,
 * rounded up to a power of two; kmalloc with vmalloc fallback as for
 * init_heap(). Evaluates to the data pointer (NULL on failure).
 */
#define __init_fifo(fifo, gfp)						\
({									\
	size_t _allocated_size, _bytes;					\
	BUG_ON(!(fifo)->size);						\
									\
	_allocated_size = roundup_pow_of_two((fifo)->size + 1);		\
	_bytes = _allocated_size * sizeof(*(fifo)->data);		\
									\
	(fifo)->mask = _allocated_size - 1;				\
	(fifo)->front = (fifo)->back = 0;				\
	(fifo)->data = NULL;						\
									\
	if (_bytes < KMALLOC_MAX_SIZE)					\
		(fifo)->data = kmalloc(_bytes, (gfp));			\
	if ((!(fifo)->data) && ((gfp) & GFP_KERNEL))			\
		(fifo)->data = vmalloc(_bytes);				\
	(fifo)->data;							\
})

/* Initialize with capacity exactly @_size */
#define init_fifo_exact(fifo, _size, gfp)				\
({									\
	(fifo)->size = (_size);						\
	__init_fifo(fifo, gfp);						\
})

/* Initialize; larger sizes are rounded so the allocation is a power of two */
#define init_fifo(fifo, _size, gfp)					\
({									\
	(fifo)->size = (_size);						\
	if ((fifo)->size > 4)						\
		(fifo)->size = roundup_pow_of_two((fifo)->size) - 1;	\
	__init_fifo(fifo, gfp);						\
})

/* Free storage allocated by __init_fifo() (handles both allocators) */
#define free_fifo(fifo)							\
do {									\
	if (is_vmalloc_addr((fifo)->data))				\
		vfree((fifo)->data);					\
	else								\
		kfree((fifo)->data);					\
	(fifo)->data = NULL;						\
} while (0)
181 | |||
182 | #define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask) | ||
183 | #define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) | ||
184 | |||
185 | #define fifo_empty(fifo) (!fifo_used(fifo)) | ||
186 | #define fifo_full(fifo) (!fifo_free(fifo)) | ||
187 | |||
188 | #define fifo_front(fifo) ((fifo)->data[(fifo)->front]) | ||
189 | #define fifo_back(fifo) \ | ||
190 | ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) | ||
191 | |||
192 | #define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask) | ||
193 | |||
194 | #define fifo_push_back(fifo, i) \ | ||
195 | ({ \ | ||
196 | bool _r = !fifo_full((fifo)); \ | ||
197 | if (_r) { \ | ||
198 | (fifo)->data[(fifo)->back++] = (i); \ | ||
199 | (fifo)->back &= (fifo)->mask; \ | ||
200 | } \ | ||
201 | _r; \ | ||
202 | }) | ||
203 | |||
204 | #define fifo_pop_front(fifo, i) \ | ||
205 | ({ \ | ||
206 | bool _r = !fifo_empty((fifo)); \ | ||
207 | if (_r) { \ | ||
208 | (i) = (fifo)->data[(fifo)->front++]; \ | ||
209 | (fifo)->front &= (fifo)->mask; \ | ||
210 | } \ | ||
211 | _r; \ | ||
212 | }) | ||
213 | |||
214 | #define fifo_push_front(fifo, i) \ | ||
215 | ({ \ | ||
216 | bool _r = !fifo_full((fifo)); \ | ||
217 | if (_r) { \ | ||
218 | --(fifo)->front; \ | ||
219 | (fifo)->front &= (fifo)->mask; \ | ||
220 | (fifo)->data[(fifo)->front] = (i); \ | ||
221 | } \ | ||
222 | _r; \ | ||
223 | }) | ||
224 | |||
225 | #define fifo_pop_back(fifo, i) \ | ||
226 | ({ \ | ||
227 | bool _r = !fifo_empty((fifo)); \ | ||
228 | if (_r) { \ | ||
229 | --(fifo)->back; \ | ||
230 | (fifo)->back &= (fifo)->mask; \ | ||
231 | (i) = (fifo)->data[(fifo)->back]; \ | ||
232 | } \ | ||
233 | _r; \ | ||
234 | }) | ||
235 | |||
236 | #define fifo_push(fifo, i) fifo_push_back(fifo, (i)) | ||
237 | #define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) | ||
238 | |||
239 | #define fifo_swap(l, r) \ | ||
240 | do { \ | ||
241 | swap((l)->front, (r)->front); \ | ||
242 | swap((l)->back, (r)->back); \ | ||
243 | swap((l)->size, (r)->size); \ | ||
244 | swap((l)->mask, (r)->mask); \ | ||
245 | swap((l)->data, (r)->data); \ | ||
246 | } while (0) | ||
247 | |||
248 | #define fifo_move(dest, src) \ | ||
249 | do { \ | ||
250 | typeof(*((dest)->data)) _t; \ | ||
251 | while (!fifo_full(dest) && \ | ||
252 | fifo_pop(src, _t)) \ | ||
253 | fifo_push(dest, _t); \ | ||
254 | } while (0) | ||
255 | |||
256 | /* | ||
257 | * Simple array based allocator - preallocates a number of elements and you can | ||
258 | * never allocate more than that, also has no locking. | ||
259 | * | ||
260 | * Handy because if you know you only need a fixed number of elements you don't | ||
261 | * have to worry about memory allocation failure, and sometimes a mempool isn't | ||
262 | * what you want. | ||
263 | * | ||
264 | * We treat the free elements as entries in a singly linked list, and the | ||
265 | * freelist as a stack - allocating and freeing push and pop off the freelist. | ||
266 | */ | ||
267 | |||
268 | #define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ | ||
269 | struct { \ | ||
270 | type *freelist; \ | ||
271 | type data[size]; \ | ||
272 | } name | ||
273 | |||
274 | #define array_alloc(array) \ | ||
275 | ({ \ | ||
276 | typeof((array)->freelist) _ret = (array)->freelist; \ | ||
277 | \ | ||
278 | if (_ret) \ | ||
279 | (array)->freelist = *((typeof((array)->freelist) *) _ret);\ | ||
280 | \ | ||
281 | _ret; \ | ||
282 | }) | ||
283 | |||
284 | #define array_free(array, ptr) \ | ||
285 | do { \ | ||
286 | typeof((array)->freelist) _ptr = ptr; \ | ||
287 | \ | ||
288 | *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ | ||
289 | (array)->freelist = _ptr; \ | ||
290 | } while (0) | ||
291 | |||
292 | #define array_allocator_init(array) \ | ||
293 | do { \ | ||
294 | typeof((array)->freelist) _i; \ | ||
295 | \ | ||
296 | BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ | ||
297 | (array)->freelist = NULL; \ | ||
298 | \ | ||
299 | for (_i = (array)->data; \ | ||
300 | _i < (array)->data + ARRAY_SIZE((array)->data); \ | ||
301 | _i++) \ | ||
302 | array_free(array, _i); \ | ||
303 | } while (0) | ||
304 | |||
305 | #define array_freelist_empty(array) ((array)->freelist == NULL) | ||
306 | |||
307 | #define ANYSINT_MAX(t) \ | ||
308 | ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) | ||
309 | |||
310 | int bch_strtoint_h(const char *, int *); | ||
311 | int bch_strtouint_h(const char *, unsigned int *); | ||
312 | int bch_strtoll_h(const char *, long long *); | ||
313 | int bch_strtoull_h(const char *, unsigned long long *); | ||
314 | |||
315 | static inline int bch_strtol_h(const char *cp, long *res) | ||
316 | { | ||
317 | #if BITS_PER_LONG == 32 | ||
318 | return bch_strtoint_h(cp, (int *) res); | ||
319 | #else | ||
320 | return bch_strtoll_h(cp, (long long *) res); | ||
321 | #endif | ||
322 | } | ||
323 | |||
324 | static inline int bch_strtoul_h(const char *cp, long *res) | ||
325 | { | ||
326 | #if BITS_PER_LONG == 32 | ||
327 | return bch_strtouint_h(cp, (unsigned int *) res); | ||
328 | #else | ||
329 | return bch_strtoull_h(cp, (unsigned long long *) res); | ||
330 | #endif | ||
331 | } | ||
332 | |||
333 | #define strtoi_h(cp, res) \ | ||
334 | (__builtin_types_compatible_p(typeof(*res), int) \ | ||
335 | ? bch_strtoint_h(cp, (void *) res) \ | ||
336 | : __builtin_types_compatible_p(typeof(*res), long) \ | ||
337 | ? bch_strtol_h(cp, (void *) res) \ | ||
338 | : __builtin_types_compatible_p(typeof(*res), long long) \ | ||
339 | ? bch_strtoll_h(cp, (void *) res) \ | ||
340 | : __builtin_types_compatible_p(typeof(*res), unsigned int) \ | ||
341 | ? bch_strtouint_h(cp, (void *) res) \ | ||
342 | : __builtin_types_compatible_p(typeof(*res), unsigned long) \ | ||
343 | ? bch_strtoul_h(cp, (void *) res) \ | ||
344 | : __builtin_types_compatible_p(typeof(*res), unsigned long long)\ | ||
345 | ? bch_strtoull_h(cp, (void *) res) : -EINVAL) | ||
346 | |||
347 | #define strtoul_safe(cp, var) \ | ||
348 | ({ \ | ||
349 | unsigned long _v; \ | ||
350 | int _r = kstrtoul(cp, 10, &_v); \ | ||
351 | if (!_r) \ | ||
352 | var = _v; \ | ||
353 | _r; \ | ||
354 | }) | ||
355 | |||
356 | #define strtoul_safe_clamp(cp, var, min, max) \ | ||
357 | ({ \ | ||
358 | unsigned long _v; \ | ||
359 | int _r = kstrtoul(cp, 10, &_v); \ | ||
360 | if (!_r) \ | ||
361 | var = clamp_t(typeof(var), _v, min, max); \ | ||
362 | _r; \ | ||
363 | }) | ||
364 | |||
365 | #define snprint(buf, size, var) \ | ||
366 | snprintf(buf, size, \ | ||
367 | __builtin_types_compatible_p(typeof(var), int) \ | ||
368 | ? "%i\n" : \ | ||
369 | __builtin_types_compatible_p(typeof(var), unsigned) \ | ||
370 | ? "%u\n" : \ | ||
371 | __builtin_types_compatible_p(typeof(var), long) \ | ||
372 | ? "%li\n" : \ | ||
373 | __builtin_types_compatible_p(typeof(var), unsigned long)\ | ||
374 | ? "%lu\n" : \ | ||
375 | __builtin_types_compatible_p(typeof(var), int64_t) \ | ||
376 | ? "%lli\n" : \ | ||
377 | __builtin_types_compatible_p(typeof(var), uint64_t) \ | ||
378 | ? "%llu\n" : \ | ||
379 | __builtin_types_compatible_p(typeof(var), const char *) \ | ||
380 | ? "%s\n" : "%i\n", var) | ||
381 | |||
382 | ssize_t bch_hprint(char *buf, int64_t v); | ||
383 | |||
384 | bool bch_is_zero(const char *p, size_t n); | ||
385 | int bch_parse_uuid(const char *s, char *uuid); | ||
386 | |||
387 | ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], | ||
388 | size_t selected); | ||
389 | |||
390 | ssize_t bch_read_string_list(const char *buf, const char * const list[]); | ||
391 | |||
392 | struct time_stats { | ||
393 | /* | ||
394 | * all fields are in nanoseconds, averages are ewmas stored left shifted | ||
395 | * by 8 | ||
396 | */ | ||
397 | uint64_t max_duration; | ||
398 | uint64_t average_duration; | ||
399 | uint64_t average_frequency; | ||
400 | uint64_t last; | ||
401 | }; | ||
402 | |||
403 | void bch_time_stats_update(struct time_stats *stats, uint64_t time); | ||
404 | |||
405 | #define NSEC_PER_ns 1L | ||
406 | #define NSEC_PER_us NSEC_PER_USEC | ||
407 | #define NSEC_PER_ms NSEC_PER_MSEC | ||
408 | #define NSEC_PER_sec NSEC_PER_SEC | ||
409 | |||
410 | #define __print_time_stat(stats, name, stat, units) \ | ||
411 | sysfs_print(name ## _ ## stat ## _ ## units, \ | ||
412 | div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) | ||
413 | |||
414 | #define sysfs_print_time_stats(stats, name, \ | ||
415 | frequency_units, \ | ||
416 | duration_units) \ | ||
417 | do { \ | ||
418 | __print_time_stat(stats, name, \ | ||
419 | average_frequency, frequency_units); \ | ||
420 | __print_time_stat(stats, name, \ | ||
421 | average_duration, duration_units); \ | ||
422 | __print_time_stat(stats, name, \ | ||
423 | max_duration, duration_units); \ | ||
424 | \ | ||
425 | sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ | ||
426 | ? div_s64(local_clock() - (stats)->last, \ | ||
427 | NSEC_PER_ ## frequency_units) \ | ||
428 | : -1LL); \ | ||
429 | } while (0) | ||
430 | |||
431 | #define sysfs_time_stats_attribute(name, \ | ||
432 | frequency_units, \ | ||
433 | duration_units) \ | ||
434 | read_attribute(name ## _average_frequency_ ## frequency_units); \ | ||
435 | read_attribute(name ## _average_duration_ ## duration_units); \ | ||
436 | read_attribute(name ## _max_duration_ ## duration_units); \ | ||
437 | read_attribute(name ## _last_ ## frequency_units) | ||
438 | |||
439 | #define sysfs_time_stats_attribute_list(name, \ | ||
440 | frequency_units, \ | ||
441 | duration_units) \ | ||
442 | &sysfs_ ## name ## _average_frequency_ ## frequency_units, \ | ||
443 | &sysfs_ ## name ## _average_duration_ ## duration_units, \ | ||
444 | &sysfs_ ## name ## _max_duration_ ## duration_units, \ | ||
445 | &sysfs_ ## name ## _last_ ## frequency_units, | ||
446 | |||
447 | #define ewma_add(ewma, val, weight, factor) \ | ||
448 | ({ \ | ||
449 | (ewma) *= (weight) - 1; \ | ||
450 | (ewma) += (val) << factor; \ | ||
451 | (ewma) /= (weight); \ | ||
452 | (ewma) >> factor; \ | ||
453 | }) | ||
454 | |||
455 | struct ratelimit { | ||
456 | uint64_t next; | ||
457 | unsigned rate; | ||
458 | }; | ||
459 | |||
460 | static inline void ratelimit_reset(struct ratelimit *d) | ||
461 | { | ||
462 | d->next = local_clock(); | ||
463 | } | ||
464 | |||
465 | unsigned bch_next_delay(struct ratelimit *d, uint64_t done); | ||
466 | |||
467 | #define __DIV_SAFE(n, d, zero) \ | ||
468 | ({ \ | ||
469 | typeof(n) _n = (n); \ | ||
470 | typeof(d) _d = (d); \ | ||
471 | _d ? _n / _d : zero; \ | ||
472 | }) | ||
473 | |||
474 | #define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) | ||
475 | |||
476 | #define container_of_or_null(ptr, type, member) \ | ||
477 | ({ \ | ||
478 | typeof(ptr) _ptr = ptr; \ | ||
479 | _ptr ? container_of(_ptr, type, member) : NULL; \ | ||
480 | }) | ||
481 | |||
482 | #define RB_INSERT(root, new, member, cmp) \ | ||
483 | ({ \ | ||
484 | __label__ dup; \ | ||
485 | struct rb_node **n = &(root)->rb_node, *parent = NULL; \ | ||
486 | typeof(new) this; \ | ||
487 | int res, ret = -1; \ | ||
488 | \ | ||
489 | while (*n) { \ | ||
490 | parent = *n; \ | ||
491 | this = container_of(*n, typeof(*(new)), member); \ | ||
492 | res = cmp(new, this); \ | ||
493 | if (!res) \ | ||
494 | goto dup; \ | ||
495 | n = res < 0 \ | ||
496 | ? &(*n)->rb_left \ | ||
497 | : &(*n)->rb_right; \ | ||
498 | } \ | ||
499 | \ | ||
500 | rb_link_node(&(new)->member, parent, n); \ | ||
501 | rb_insert_color(&(new)->member, root); \ | ||
502 | ret = 0; \ | ||
503 | dup: \ | ||
504 | ret; \ | ||
505 | }) | ||
506 | |||
507 | #define RB_SEARCH(root, search, member, cmp) \ | ||
508 | ({ \ | ||
509 | struct rb_node *n = (root)->rb_node; \ | ||
510 | typeof(&(search)) this, ret = NULL; \ | ||
511 | int res; \ | ||
512 | \ | ||
513 | while (n) { \ | ||
514 | this = container_of(n, typeof(search), member); \ | ||
515 | res = cmp(&(search), this); \ | ||
516 | if (!res) { \ | ||
517 | ret = this; \ | ||
518 | break; \ | ||
519 | } \ | ||
520 | n = res < 0 \ | ||
521 | ? n->rb_left \ | ||
522 | : n->rb_right; \ | ||
523 | } \ | ||
524 | ret; \ | ||
525 | }) | ||
526 | |||
527 | #define RB_GREATER(root, search, member, cmp) \ | ||
528 | ({ \ | ||
529 | struct rb_node *n = (root)->rb_node; \ | ||
530 | typeof(&(search)) this, ret = NULL; \ | ||
531 | int res; \ | ||
532 | \ | ||
533 | while (n) { \ | ||
534 | this = container_of(n, typeof(search), member); \ | ||
535 | res = cmp(&(search), this); \ | ||
536 | if (res < 0) { \ | ||
537 | ret = this; \ | ||
538 | n = n->rb_left; \ | ||
539 | } else \ | ||
540 | n = n->rb_right; \ | ||
541 | } \ | ||
542 | ret; \ | ||
543 | }) | ||
544 | |||
545 | #define RB_FIRST(root, type, member) \ | ||
546 | container_of_or_null(rb_first(root), type, member) | ||
547 | |||
548 | #define RB_LAST(root, type, member) \ | ||
549 | container_of_or_null(rb_last(root), type, member) | ||
550 | |||
551 | #define RB_NEXT(ptr, member) \ | ||
552 | container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) | ||
553 | |||
554 | #define RB_PREV(ptr, member) \ | ||
555 | container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) | ||
556 | |||
557 | /* Does linear interpolation between powers of two */ | ||
558 | static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) | ||
559 | { | ||
560 | unsigned fract = x & ~(~0 << fract_bits); | ||
561 | |||
562 | x >>= fract_bits; | ||
563 | x = 1 << x; | ||
564 | x += (x * fract) >> fract_bits; | ||
565 | |||
566 | return x; | ||
567 | } | ||
568 | |||
569 | #define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio)) | ||
570 | |||
571 | void bch_bio_map(struct bio *bio, void *base); | ||
572 | |||
573 | int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp); | ||
574 | |||
575 | static inline sector_t bdev_sectors(struct block_device *bdev) | ||
576 | { | ||
577 | return bdev->bd_inode->i_size >> 9; | ||
578 | } | ||
579 | |||
580 | #define closure_bio_submit(bio, cl, dev) \ | ||
581 | do { \ | ||
582 | closure_get(cl); \ | ||
583 | bch_generic_make_request(bio, &(dev)->bio_split_hook); \ | ||
584 | } while (0) | ||
585 | |||
586 | uint64_t bch_crc64_update(uint64_t, const void *, size_t); | ||
587 | uint64_t bch_crc64(const void *, size_t); | ||
588 | |||
589 | #endif /* _BCACHE_UTIL_H */ | ||
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c new file mode 100644 index 000000000000..93e7e31a4bd3 --- /dev/null +++ b/drivers/md/bcache/writeback.c | |||
@@ -0,0 +1,414 @@ | |||
1 | /* | ||
2 | * background writeback - scan btree for dirty data and write it to the backing | ||
3 | * device | ||
4 | * | ||
5 | * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> | ||
6 | * Copyright 2012 Google, Inc. | ||
7 | */ | ||
8 | |||
9 | #include "bcache.h" | ||
10 | #include "btree.h" | ||
11 | #include "debug.h" | ||
12 | |||
13 | static struct workqueue_struct *dirty_wq; | ||
14 | |||
15 | static void read_dirty(struct closure *); | ||
16 | |||
17 | struct dirty_io { | ||
18 | struct closure cl; | ||
19 | struct cached_dev *dc; | ||
20 | struct bio bio; | ||
21 | }; | ||
22 | |||
23 | /* Rate limiting */ | ||
24 | |||
25 | static void __update_writeback_rate(struct cached_dev *dc) | ||
26 | { | ||
27 | struct cache_set *c = dc->disk.c; | ||
28 | uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; | ||
29 | uint64_t cache_dirty_target = | ||
30 | div_u64(cache_sectors * dc->writeback_percent, 100); | ||
31 | |||
32 | int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), | ||
33 | c->cached_dev_sectors); | ||
34 | |||
35 | /* PD controller */ | ||
36 | |||
37 | int change = 0; | ||
38 | int64_t error; | ||
39 | int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); | ||
40 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; | ||
41 | |||
42 | dc->disk.sectors_dirty_last = dirty; | ||
43 | |||
44 | derivative *= dc->writeback_rate_d_term; | ||
45 | derivative = clamp(derivative, -dirty, dirty); | ||
46 | |||
47 | derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, | ||
48 | dc->writeback_rate_d_smooth, 0); | ||
49 | |||
50 | /* Avoid divide by zero */ | ||
51 | if (!target) | ||
52 | goto out; | ||
53 | |||
54 | error = div64_s64((dirty + derivative - target) << 8, target); | ||
55 | |||
56 | change = div_s64((dc->writeback_rate.rate * error) >> 8, | ||
57 | dc->writeback_rate_p_term_inverse); | ||
58 | |||
59 | /* Don't increase writeback rate if the device isn't keeping up */ | ||
60 | if (change > 0 && | ||
61 | time_after64(local_clock(), | ||
62 | dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) | ||
63 | change = 0; | ||
64 | |||
65 | dc->writeback_rate.rate = | ||
66 | clamp_t(int64_t, dc->writeback_rate.rate + change, | ||
67 | 1, NSEC_PER_MSEC); | ||
68 | out: | ||
69 | dc->writeback_rate_derivative = derivative; | ||
70 | dc->writeback_rate_change = change; | ||
71 | dc->writeback_rate_target = target; | ||
72 | |||
73 | schedule_delayed_work(&dc->writeback_rate_update, | ||
74 | dc->writeback_rate_update_seconds * HZ); | ||
75 | } | ||
76 | |||
77 | static void update_writeback_rate(struct work_struct *work) | ||
78 | { | ||
79 | struct cached_dev *dc = container_of(to_delayed_work(work), | ||
80 | struct cached_dev, | ||
81 | writeback_rate_update); | ||
82 | |||
83 | down_read(&dc->writeback_lock); | ||
84 | |||
85 | if (atomic_read(&dc->has_dirty) && | ||
86 | dc->writeback_percent) | ||
87 | __update_writeback_rate(dc); | ||
88 | |||
89 | up_read(&dc->writeback_lock); | ||
90 | } | ||
91 | |||
92 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | ||
93 | { | ||
94 | if (atomic_read(&dc->disk.detaching) || | ||
95 | !dc->writeback_percent) | ||
96 | return 0; | ||
97 | |||
98 | return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); | ||
99 | } | ||
100 | |||
101 | /* Background writeback */ | ||
102 | |||
103 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) | ||
104 | { | ||
105 | return KEY_DIRTY(k); | ||
106 | } | ||
107 | |||
108 | static void dirty_init(struct keybuf_key *w) | ||
109 | { | ||
110 | struct dirty_io *io = w->private; | ||
111 | struct bio *bio = &io->bio; | ||
112 | |||
113 | bio_init(bio); | ||
114 | if (!io->dc->writeback_percent) | ||
115 | bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | ||
116 | |||
117 | bio->bi_size = KEY_SIZE(&w->key) << 9; | ||
118 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); | ||
119 | bio->bi_private = w; | ||
120 | bio->bi_io_vec = bio->bi_inline_vecs; | ||
121 | bch_bio_map(bio, NULL); | ||
122 | } | ||
123 | |||
124 | static void refill_dirty(struct closure *cl) | ||
125 | { | ||
126 | struct cached_dev *dc = container_of(cl, struct cached_dev, | ||
127 | writeback.cl); | ||
128 | struct keybuf *buf = &dc->writeback_keys; | ||
129 | bool searched_from_start = false; | ||
130 | struct bkey end = MAX_KEY; | ||
131 | SET_KEY_INODE(&end, dc->disk.id); | ||
132 | |||
133 | if (!atomic_read(&dc->disk.detaching) && | ||
134 | !dc->writeback_running) | ||
135 | closure_return(cl); | ||
136 | |||
137 | down_write(&dc->writeback_lock); | ||
138 | |||
139 | if (!atomic_read(&dc->has_dirty)) { | ||
140 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); | ||
141 | bch_write_bdev_super(dc, NULL); | ||
142 | |||
143 | up_write(&dc->writeback_lock); | ||
144 | closure_return(cl); | ||
145 | } | ||
146 | |||
147 | if (bkey_cmp(&buf->last_scanned, &end) >= 0) { | ||
148 | buf->last_scanned = KEY(dc->disk.id, 0, 0); | ||
149 | searched_from_start = true; | ||
150 | } | ||
151 | |||
152 | bch_refill_keybuf(dc->disk.c, buf, &end); | ||
153 | |||
154 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { | ||
155 | /* Searched the entire btree - delay awhile */ | ||
156 | |||
157 | if (RB_EMPTY_ROOT(&buf->keys)) { | ||
158 | atomic_set(&dc->has_dirty, 0); | ||
159 | cached_dev_put(dc); | ||
160 | } | ||
161 | |||
162 | if (!atomic_read(&dc->disk.detaching)) | ||
163 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | ||
164 | } | ||
165 | |||
166 | up_write(&dc->writeback_lock); | ||
167 | |||
168 | ratelimit_reset(&dc->writeback_rate); | ||
169 | |||
170 | /* Punt to workqueue only so we don't recurse and blow the stack */ | ||
171 | continue_at(cl, read_dirty, dirty_wq); | ||
172 | } | ||
173 | |||
174 | void bch_writeback_queue(struct cached_dev *dc) | ||
175 | { | ||
176 | if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { | ||
177 | if (!atomic_read(&dc->disk.detaching)) | ||
178 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | ||
179 | |||
180 | continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); | ||
181 | } | ||
182 | } | ||
183 | |||
184 | void bch_writeback_add(struct cached_dev *dc, unsigned sectors) | ||
185 | { | ||
186 | atomic_long_add(sectors, &dc->disk.sectors_dirty); | ||
187 | |||
188 | if (!atomic_read(&dc->has_dirty) && | ||
189 | !atomic_xchg(&dc->has_dirty, 1)) { | ||
190 | atomic_inc(&dc->count); | ||
191 | |||
192 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { | ||
193 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); | ||
194 | /* XXX: should do this synchronously */ | ||
195 | bch_write_bdev_super(dc, NULL); | ||
196 | } | ||
197 | |||
198 | bch_writeback_queue(dc); | ||
199 | |||
200 | if (dc->writeback_percent) | ||
201 | schedule_delayed_work(&dc->writeback_rate_update, | ||
202 | dc->writeback_rate_update_seconds * HZ); | ||
203 | } | ||
204 | } | ||
205 | |||
206 | /* Background writeback - IO loop */ | ||
207 | |||
208 | static void dirty_io_destructor(struct closure *cl) | ||
209 | { | ||
210 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
211 | kfree(io); | ||
212 | } | ||
213 | |||
214 | static void write_dirty_finish(struct closure *cl) | ||
215 | { | ||
216 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
217 | struct keybuf_key *w = io->bio.bi_private; | ||
218 | struct cached_dev *dc = io->dc; | ||
219 | struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); | ||
220 | |||
221 | while (bv-- != io->bio.bi_io_vec) | ||
222 | __free_page(bv->bv_page); | ||
223 | |||
224 | /* This is kind of a dumb way of signalling errors. */ | ||
225 | if (KEY_DIRTY(&w->key)) { | ||
226 | unsigned i; | ||
227 | struct btree_op op; | ||
228 | bch_btree_op_init_stack(&op); | ||
229 | |||
230 | op.type = BTREE_REPLACE; | ||
231 | bkey_copy(&op.replace, &w->key); | ||
232 | |||
233 | SET_KEY_DIRTY(&w->key, false); | ||
234 | bch_keylist_add(&op.keys, &w->key); | ||
235 | |||
236 | for (i = 0; i < KEY_PTRS(&w->key); i++) | ||
237 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | ||
238 | |||
239 | pr_debug("clearing %s", pkey(&w->key)); | ||
240 | bch_btree_insert(&op, dc->disk.c); | ||
241 | closure_sync(&op.cl); | ||
242 | |||
243 | atomic_long_inc(op.insert_collision | ||
244 | ? &dc->disk.c->writeback_keys_failed | ||
245 | : &dc->disk.c->writeback_keys_done); | ||
246 | } | ||
247 | |||
248 | bch_keybuf_del(&dc->writeback_keys, w); | ||
249 | atomic_dec_bug(&dc->in_flight); | ||
250 | |||
251 | closure_wake_up(&dc->writeback_wait); | ||
252 | |||
253 | closure_return_with_destructor(cl, dirty_io_destructor); | ||
254 | } | ||
255 | |||
256 | static void dirty_endio(struct bio *bio, int error) | ||
257 | { | ||
258 | struct keybuf_key *w = bio->bi_private; | ||
259 | struct dirty_io *io = w->private; | ||
260 | |||
261 | if (error) | ||
262 | SET_KEY_DIRTY(&w->key, false); | ||
263 | |||
264 | closure_put(&io->cl); | ||
265 | } | ||
266 | |||
267 | static void write_dirty(struct closure *cl) | ||
268 | { | ||
269 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
270 | struct keybuf_key *w = io->bio.bi_private; | ||
271 | |||
272 | dirty_init(w); | ||
273 | io->bio.bi_rw = WRITE; | ||
274 | io->bio.bi_sector = KEY_START(&w->key); | ||
275 | io->bio.bi_bdev = io->dc->bdev; | ||
276 | io->bio.bi_end_io = dirty_endio; | ||
277 | |||
278 | trace_bcache_write_dirty(&io->bio); | ||
279 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | ||
280 | |||
281 | continue_at(cl, write_dirty_finish, dirty_wq); | ||
282 | } | ||
283 | |||
284 | static void read_dirty_endio(struct bio *bio, int error) | ||
285 | { | ||
286 | struct keybuf_key *w = bio->bi_private; | ||
287 | struct dirty_io *io = w->private; | ||
288 | |||
289 | bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), | ||
290 | error, "reading dirty data from cache"); | ||
291 | |||
292 | dirty_endio(bio, error); | ||
293 | } | ||
294 | |||
295 | static void read_dirty_submit(struct closure *cl) | ||
296 | { | ||
297 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | ||
298 | |||
299 | trace_bcache_read_dirty(&io->bio); | ||
300 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | ||
301 | |||
302 | continue_at(cl, write_dirty, dirty_wq); | ||
303 | } | ||
304 | |||
305 | static void read_dirty(struct closure *cl) | ||
306 | { | ||
307 | struct cached_dev *dc = container_of(cl, struct cached_dev, | ||
308 | writeback.cl); | ||
309 | unsigned delay = writeback_delay(dc, 0); | ||
310 | struct keybuf_key *w; | ||
311 | struct dirty_io *io; | ||
312 | |||
313 | /* | ||
314 | * XXX: if we error, background writeback just spins. Should use some | ||
315 | * mempools. | ||
316 | */ | ||
317 | |||
318 | while (1) { | ||
319 | w = bch_keybuf_next(&dc->writeback_keys); | ||
320 | if (!w) | ||
321 | break; | ||
322 | |||
323 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); | ||
324 | |||
325 | if (delay > 0 && | ||
326 | (KEY_START(&w->key) != dc->last_read || | ||
327 | jiffies_to_msecs(delay) > 50)) { | ||
328 | w->private = NULL; | ||
329 | |||
330 | closure_delay(&dc->writeback, delay); | ||
331 | continue_at(cl, read_dirty, dirty_wq); | ||
332 | } | ||
333 | |||
334 | dc->last_read = KEY_OFFSET(&w->key); | ||
335 | |||
336 | io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) | ||
337 | * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), | ||
338 | GFP_KERNEL); | ||
339 | if (!io) | ||
340 | goto err; | ||
341 | |||
342 | w->private = io; | ||
343 | io->dc = dc; | ||
344 | |||
345 | dirty_init(w); | ||
346 | io->bio.bi_sector = PTR_OFFSET(&w->key, 0); | ||
347 | io->bio.bi_bdev = PTR_CACHE(dc->disk.c, | ||
348 | &w->key, 0)->bdev; | ||
349 | io->bio.bi_rw = READ; | ||
350 | io->bio.bi_end_io = read_dirty_endio; | ||
351 | |||
352 | if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) | ||
353 | goto err_free; | ||
354 | |||
355 | pr_debug("%s", pkey(&w->key)); | ||
356 | |||
357 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); | ||
358 | |||
359 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); | ||
360 | |||
361 | atomic_inc(&dc->in_flight); | ||
362 | |||
363 | if (!closure_wait_event(&dc->writeback_wait, cl, | ||
364 | atomic_read(&dc->in_flight) < 64)) | ||
365 | continue_at(cl, read_dirty, dirty_wq); | ||
366 | } | ||
367 | |||
368 | if (0) { | ||
369 | err_free: | ||
370 | kfree(w->private); | ||
371 | err: | ||
372 | bch_keybuf_del(&dc->writeback_keys, w); | ||
373 | } | ||
374 | |||
375 | refill_dirty(cl); | ||
376 | } | ||
377 | |||
378 | void bch_writeback_init_cached_dev(struct cached_dev *dc) | ||
379 | { | ||
380 | closure_init_unlocked(&dc->writeback); | ||
381 | init_rwsem(&dc->writeback_lock); | ||
382 | |||
383 | bch_keybuf_init(&dc->writeback_keys, dirty_pred); | ||
384 | |||
385 | dc->writeback_metadata = true; | ||
386 | dc->writeback_running = true; | ||
387 | dc->writeback_percent = 10; | ||
388 | dc->writeback_delay = 30; | ||
389 | dc->writeback_rate.rate = 1024; | ||
390 | |||
391 | dc->writeback_rate_update_seconds = 30; | ||
392 | dc->writeback_rate_d_term = 16; | ||
393 | dc->writeback_rate_p_term_inverse = 64; | ||
394 | dc->writeback_rate_d_smooth = 8; | ||
395 | |||
396 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); | ||
397 | schedule_delayed_work(&dc->writeback_rate_update, | ||
398 | dc->writeback_rate_update_seconds * HZ); | ||
399 | } | ||
400 | |||
401 | void bch_writeback_exit(void) | ||
402 | { | ||
403 | if (dirty_wq) | ||
404 | destroy_workqueue(dirty_wq); | ||
405 | } | ||
406 | |||
407 | int __init bch_writeback_init(void) | ||
408 | { | ||
409 | dirty_wq = create_singlethread_workqueue("bcache_writeback"); | ||
410 | if (!dirty_wq) | ||
411 | return -ENOMEM; | ||
412 | |||
413 | return 0; | ||
414 | } | ||
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index f204a7a9cf38..6e7ec64b69ab 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h | |||
@@ -78,3 +78,9 @@ SUBSYS(hugetlb) | |||
78 | #endif | 78 | #endif |
79 | 79 | ||
80 | /* */ | 80 | /* */ |
81 | |||
82 | #ifdef CONFIG_CGROUP_BCACHE | ||
83 | SUBSYS(bcache) | ||
84 | #endif | ||
85 | |||
86 | /* */ | ||
diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 0c5a18ec322c..1b4d4ee1168f 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h | |||
@@ -52,7 +52,7 @@ | |||
52 | #endif | 52 | #endif |
53 | 53 | ||
54 | extern const char *drbd_buildtag(void); | 54 | extern const char *drbd_buildtag(void); |
55 | #define REL_VERSION "8.4.2" | 55 | #define REL_VERSION "8.4.3" |
56 | #define API_VERSION 1 | 56 | #define API_VERSION 1 |
57 | #define PRO_VERSION_MIN 86 | 57 | #define PRO_VERSION_MIN 86 |
58 | #define PRO_VERSION_MAX 101 | 58 | #define PRO_VERSION_MAX 101 |
@@ -319,7 +319,8 @@ enum drbd_state_rv { | |||
319 | SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ | 319 | SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ |
320 | SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ | 320 | SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ |
321 | SS_O_VOL_PEER_PRI = -20, | 321 | SS_O_VOL_PEER_PRI = -20, |
322 | SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ | 322 | SS_OUTDATE_WO_CONN = -21, |
323 | SS_AFTER_LAST_ERROR = -22, /* Keep this at bottom */ | ||
323 | }; | 324 | }; |
324 | 325 | ||
325 | /* from drbd_strings.c */ | 326 | /* from drbd_strings.c */ |
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 1fa19c5f5e64..1fedf2b17cc8 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h | |||
@@ -126,13 +126,12 @@ | |||
126 | #define DRBD_RESYNC_RATE_DEF 250 | 126 | #define DRBD_RESYNC_RATE_DEF 250 |
127 | #define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ | 127 | #define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ |
128 | 128 | ||
129 | /* less than 7 would hit performance unnecessarily. | 129 | /* less than 7 would hit performance unnecessarily. */ |
130 | * 919 slots context information per transaction, | ||
131 | * 32k activity log, 4k transaction size, | ||
132 | * one transaction in flight: | ||
133 | * 919 * 7 = 6433 */ | ||
134 | #define DRBD_AL_EXTENTS_MIN 7 | 130 | #define DRBD_AL_EXTENTS_MIN 7 |
135 | #define DRBD_AL_EXTENTS_MAX 6433 | 131 | /* we use u16 as "slot number", (u16)~0 is "FREE". |
132 | * If you use >= 292 kB on-disk ring buffer, | ||
133 | * this is the maximum you can use: */ | ||
134 | #define DRBD_AL_EXTENTS_MAX 0xfffe | ||
136 | #define DRBD_AL_EXTENTS_DEF 1237 | 135 | #define DRBD_AL_EXTENTS_DEF 1237 |
137 | #define DRBD_AL_EXTENTS_SCALE '1' | 136 | #define DRBD_AL_EXTENTS_SCALE '1' |
138 | 137 | ||
diff --git a/include/linux/idr.h b/include/linux/idr.h index a470ac3ef49d..871a213a8477 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h | |||
@@ -124,11 +124,13 @@ static inline void *idr_find(struct idr *idr, int id) | |||
124 | * @idp: idr handle | 124 | * @idp: idr handle |
125 | * @entry: the type * to use as cursor | 125 | * @entry: the type * to use as cursor |
126 | * @id: id entry's key | 126 | * @id: id entry's key |
127 | * | ||
128 | * @entry and @id do not need to be initialized before the loop, and | ||
129 | * after normal termination @entry is left with the value NULL. This | ||
130 | * is convenient for a "not found" value. | ||
127 | */ | 131 | */ |
128 | #define idr_for_each_entry(idp, entry, id) \ | 132 | #define idr_for_each_entry(idp, entry, id) \ |
129 | for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ | 133 | for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) |
130 | entry != NULL; \ | ||
131 | ++id, entry = (typeof(entry))idr_get_next((idp), &(id))) | ||
132 | 134 | ||
133 | /* | 135 | /* |
134 | * Don't use the following functions. These exist only to suppress | 136 | * Don't use the following functions. These exist only to suppress |
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 4019013c6593..46262284de47 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h | |||
@@ -256,6 +256,7 @@ extern void lc_destroy(struct lru_cache *lc); | |||
256 | extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); | 256 | extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); |
257 | extern void lc_del(struct lru_cache *lc, struct lc_element *element); | 257 | extern void lc_del(struct lru_cache *lc, struct lc_element *element); |
258 | 258 | ||
259 | extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr); | ||
259 | extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); | 260 | extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); |
260 | extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); | 261 | extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); |
261 | extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); | 262 | extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); |
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8da67d625e13..0616ffe45702 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h | |||
@@ -133,10 +133,20 @@ do { \ | |||
133 | _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ | 133 | _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ |
134 | } while (0); | 134 | } while (0); |
135 | 135 | ||
136 | /* | ||
137 | * Take/release a lock when not the owner will release it. | ||
138 | * | ||
139 | * [ This API should be avoided as much as possible - the | ||
140 | * proper abstraction for this case is completions. ] | ||
141 | */ | ||
142 | extern void down_read_non_owner(struct rw_semaphore *sem); | ||
143 | extern void up_read_non_owner(struct rw_semaphore *sem); | ||
136 | #else | 144 | #else |
137 | # define down_read_nested(sem, subclass) down_read(sem) | 145 | # define down_read_nested(sem, subclass) down_read(sem) |
138 | # define down_write_nest_lock(sem, nest_lock) down_write(sem) | 146 | # define down_write_nest_lock(sem, nest_lock) down_write(sem) |
139 | # define down_write_nested(sem, subclass) down_write(sem) | 147 | # define down_write_nested(sem, subclass) down_write(sem) |
148 | # define down_read_non_owner(sem) down_read(sem) | ||
149 | # define up_read_non_owner(sem) up_read(sem) | ||
140 | #endif | 150 | #endif |
141 | 151 | ||
142 | #endif /* _LINUX_RWSEM_H */ | 152 | #endif /* _LINUX_RWSEM_H */ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 022c085ac3c5..caa8f4d0186b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1411,6 +1411,10 @@ struct task_struct { | |||
1411 | #ifdef CONFIG_UPROBES | 1411 | #ifdef CONFIG_UPROBES |
1412 | struct uprobe_task *utask; | 1412 | struct uprobe_task *utask; |
1413 | #endif | 1413 | #endif |
1414 | #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) | ||
1415 | unsigned int sequential_io; | ||
1416 | unsigned int sequential_io_avg; | ||
1417 | #endif | ||
1414 | }; | 1418 | }; |
1415 | 1419 | ||
1416 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ | 1420 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ |
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h new file mode 100644 index 000000000000..3cc5a0b278c3 --- /dev/null +++ b/include/trace/events/bcache.h | |||
@@ -0,0 +1,271 @@ | |||
1 | #undef TRACE_SYSTEM | ||
2 | #define TRACE_SYSTEM bcache | ||
3 | |||
4 | #if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) | ||
5 | #define _TRACE_BCACHE_H | ||
6 | |||
7 | #include <linux/tracepoint.h> | ||
8 | |||
9 | struct search; | ||
10 | |||
11 | DECLARE_EVENT_CLASS(bcache_request, | ||
12 | |||
13 | TP_PROTO(struct search *s, struct bio *bio), | ||
14 | |||
15 | TP_ARGS(s, bio), | ||
16 | |||
17 | TP_STRUCT__entry( | ||
18 | __field(dev_t, dev ) | ||
19 | __field(unsigned int, orig_major ) | ||
20 | __field(unsigned int, orig_minor ) | ||
21 | __field(sector_t, sector ) | ||
22 | __field(dev_t, orig_sector ) | ||
23 | __field(unsigned int, nr_sector ) | ||
24 | __array(char, rwbs, 6 ) | ||
25 | __array(char, comm, TASK_COMM_LEN ) | ||
26 | ), | ||
27 | |||
28 | TP_fast_assign( | ||
29 | __entry->dev = bio->bi_bdev->bd_dev; | ||
30 | __entry->orig_major = s->d->disk->major; | ||
31 | __entry->orig_minor = s->d->disk->first_minor; | ||
32 | __entry->sector = bio->bi_sector; | ||
33 | __entry->orig_sector = bio->bi_sector - 16; | ||
34 | __entry->nr_sector = bio->bi_size >> 9; | ||
35 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); | ||
36 | memcpy(__entry->comm, current->comm, TASK_COMM_LEN); | ||
37 | ), | ||
38 | |||
39 | TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)", | ||
40 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
41 | __entry->rwbs, | ||
42 | (unsigned long long)__entry->sector, | ||
43 | __entry->nr_sector, __entry->comm, | ||
44 | __entry->orig_major, __entry->orig_minor, | ||
45 | (unsigned long long)__entry->orig_sector) | ||
46 | ); | ||
47 | |||
48 | DEFINE_EVENT(bcache_request, bcache_request_start, | ||
49 | |||
50 | TP_PROTO(struct search *s, struct bio *bio), | ||
51 | |||
52 | TP_ARGS(s, bio) | ||
53 | ); | ||
54 | |||
55 | DEFINE_EVENT(bcache_request, bcache_request_end, | ||
56 | |||
57 | TP_PROTO(struct search *s, struct bio *bio), | ||
58 | |||
59 | TP_ARGS(s, bio) | ||
60 | ); | ||
61 | |||
62 | DECLARE_EVENT_CLASS(bcache_bio, | ||
63 | |||
64 | TP_PROTO(struct bio *bio), | ||
65 | |||
66 | TP_ARGS(bio), | ||
67 | |||
68 | TP_STRUCT__entry( | ||
69 | __field(dev_t, dev ) | ||
70 | __field(sector_t, sector ) | ||
71 | __field(unsigned int, nr_sector ) | ||
72 | __array(char, rwbs, 6 ) | ||
73 | __array(char, comm, TASK_COMM_LEN ) | ||
74 | ), | ||
75 | |||
76 | TP_fast_assign( | ||
77 | __entry->dev = bio->bi_bdev->bd_dev; | ||
78 | __entry->sector = bio->bi_sector; | ||
79 | __entry->nr_sector = bio->bi_size >> 9; | ||
80 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); | ||
81 | memcpy(__entry->comm, current->comm, TASK_COMM_LEN); | ||
82 | ), | ||
83 | |||
84 | TP_printk("%d,%d %s %llu + %u [%s]", | ||
85 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
86 | __entry->rwbs, | ||
87 | (unsigned long long)__entry->sector, | ||
88 | __entry->nr_sector, __entry->comm) | ||
89 | ); | ||
90 | |||
91 | |||
92 | DEFINE_EVENT(bcache_bio, bcache_passthrough, | ||
93 | |||
94 | TP_PROTO(struct bio *bio), | ||
95 | |||
96 | TP_ARGS(bio) | ||
97 | ); | ||
98 | |||
99 | DEFINE_EVENT(bcache_bio, bcache_cache_hit, | ||
100 | |||
101 | TP_PROTO(struct bio *bio), | ||
102 | |||
103 | TP_ARGS(bio) | ||
104 | ); | ||
105 | |||
106 | DEFINE_EVENT(bcache_bio, bcache_cache_miss, | ||
107 | |||
108 | TP_PROTO(struct bio *bio), | ||
109 | |||
110 | TP_ARGS(bio) | ||
111 | ); | ||
112 | |||
113 | DEFINE_EVENT(bcache_bio, bcache_read_retry, | ||
114 | |||
115 | TP_PROTO(struct bio *bio), | ||
116 | |||
117 | TP_ARGS(bio) | ||
118 | ); | ||
119 | |||
120 | DEFINE_EVENT(bcache_bio, bcache_writethrough, | ||
121 | |||
122 | TP_PROTO(struct bio *bio), | ||
123 | |||
124 | TP_ARGS(bio) | ||
125 | ); | ||
126 | |||
127 | DEFINE_EVENT(bcache_bio, bcache_writeback, | ||
128 | |||
129 | TP_PROTO(struct bio *bio), | ||
130 | |||
131 | TP_ARGS(bio) | ||
132 | ); | ||
133 | |||
134 | DEFINE_EVENT(bcache_bio, bcache_write_skip, | ||
135 | |||
136 | TP_PROTO(struct bio *bio), | ||
137 | |||
138 | TP_ARGS(bio) | ||
139 | ); | ||
140 | |||
141 | DEFINE_EVENT(bcache_bio, bcache_btree_read, | ||
142 | |||
143 | TP_PROTO(struct bio *bio), | ||
144 | |||
145 | TP_ARGS(bio) | ||
146 | ); | ||
147 | |||
148 | DEFINE_EVENT(bcache_bio, bcache_btree_write, | ||
149 | |||
150 | TP_PROTO(struct bio *bio), | ||
151 | |||
152 | TP_ARGS(bio) | ||
153 | ); | ||
154 | |||
155 | DEFINE_EVENT(bcache_bio, bcache_write_dirty, | ||
156 | |||
157 | TP_PROTO(struct bio *bio), | ||
158 | |||
159 | TP_ARGS(bio) | ||
160 | ); | ||
161 | |||
162 | DEFINE_EVENT(bcache_bio, bcache_read_dirty, | ||
163 | |||
164 | TP_PROTO(struct bio *bio), | ||
165 | |||
166 | TP_ARGS(bio) | ||
167 | ); | ||
168 | |||
169 | DEFINE_EVENT(bcache_bio, bcache_write_moving, | ||
170 | |||
171 | TP_PROTO(struct bio *bio), | ||
172 | |||
173 | TP_ARGS(bio) | ||
174 | ); | ||
175 | |||
176 | DEFINE_EVENT(bcache_bio, bcache_read_moving, | ||
177 | |||
178 | TP_PROTO(struct bio *bio), | ||
179 | |||
180 | TP_ARGS(bio) | ||
181 | ); | ||
182 | |||
183 | DEFINE_EVENT(bcache_bio, bcache_journal_write, | ||
184 | |||
185 | TP_PROTO(struct bio *bio), | ||
186 | |||
187 | TP_ARGS(bio) | ||
188 | ); | ||
189 | |||
190 | DECLARE_EVENT_CLASS(bcache_cache_bio, | ||
191 | |||
192 | TP_PROTO(struct bio *bio, | ||
193 | sector_t orig_sector, | ||
194 | struct block_device* orig_bdev), | ||
195 | |||
196 | TP_ARGS(bio, orig_sector, orig_bdev), | ||
197 | |||
198 | TP_STRUCT__entry( | ||
199 | __field(dev_t, dev ) | ||
200 | __field(dev_t, orig_dev ) | ||
201 | __field(sector_t, sector ) | ||
202 | __field(sector_t, orig_sector ) | ||
203 | __field(unsigned int, nr_sector ) | ||
204 | __array(char, rwbs, 6 ) | ||
205 | __array(char, comm, TASK_COMM_LEN ) | ||
206 | ), | ||
207 | |||
208 | TP_fast_assign( | ||
209 | __entry->dev = bio->bi_bdev->bd_dev; | ||
210 | __entry->orig_dev = orig_bdev->bd_dev; | ||
211 | __entry->sector = bio->bi_sector; | ||
212 | __entry->orig_sector = orig_sector; | ||
213 | __entry->nr_sector = bio->bi_size >> 9; | ||
214 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); | ||
215 | memcpy(__entry->comm, current->comm, TASK_COMM_LEN); | ||
216 | ), | ||
217 | |||
218 | TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)", | ||
219 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
220 | __entry->rwbs, | ||
221 | (unsigned long long)__entry->sector, | ||
222 | __entry->nr_sector, __entry->comm, | ||
223 | MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev), | ||
224 | (unsigned long long)__entry->orig_sector) | ||
225 | ); | ||
226 | |||
227 | DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert, | ||
228 | |||
229 | TP_PROTO(struct bio *bio, | ||
230 | sector_t orig_sector, | ||
231 | struct block_device *orig_bdev), | ||
232 | |||
233 | TP_ARGS(bio, orig_sector, orig_bdev) | ||
234 | ); | ||
235 | |||
236 | DECLARE_EVENT_CLASS(bcache_gc, | ||
237 | |||
238 | TP_PROTO(uint8_t *uuid), | ||
239 | |||
240 | TP_ARGS(uuid), | ||
241 | |||
242 | TP_STRUCT__entry( | ||
243 | __field(uint8_t *, uuid) | ||
244 | ), | ||
245 | |||
246 | TP_fast_assign( | ||
247 | __entry->uuid = uuid; | ||
248 | ), | ||
249 | |||
250 | TP_printk("%pU", __entry->uuid) | ||
251 | ); | ||
252 | |||
253 | |||
254 | DEFINE_EVENT(bcache_gc, bcache_gc_start, | ||
255 | |||
256 | TP_PROTO(uint8_t *uuid), | ||
257 | |||
258 | TP_ARGS(uuid) | ||
259 | ); | ||
260 | |||
261 | DEFINE_EVENT(bcache_gc, bcache_gc_end, | ||
262 | |||
263 | TP_PROTO(uint8_t *uuid), | ||
264 | |||
265 | TP_ARGS(uuid) | ||
266 | ); | ||
267 | |||
268 | #endif /* _TRACE_BCACHE_H */ | ||
269 | |||
270 | /* This part must be outside protection */ | ||
271 | #include <trace/define_trace.h> | ||
diff --git a/kernel/fork.c b/kernel/fork.c index c509cc4a0d53..987b28a1f01b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1304,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1304 | p->memcg_batch.do_batch = 0; | 1304 | p->memcg_batch.do_batch = 0; |
1305 | p->memcg_batch.memcg = NULL; | 1305 | p->memcg_batch.memcg = NULL; |
1306 | #endif | 1306 | #endif |
1307 | #ifdef CONFIG_BCACHE | ||
1308 | p->sequential_io = 0; | ||
1309 | p->sequential_io_avg = 0; | ||
1310 | #endif | ||
1307 | 1311 | ||
1308 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1312 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1309 | sched_fork(p); | 1313 | sched_fork(p); |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 6a3bccba7e7d..1f3186b37fd5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -2998,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
2998 | EXPORT_SYMBOL_GPL(lockdep_init_map); | 2998 | EXPORT_SYMBOL_GPL(lockdep_init_map); |
2999 | 2999 | ||
3000 | struct lock_class_key __lockdep_no_validate__; | 3000 | struct lock_class_key __lockdep_no_validate__; |
3001 | EXPORT_SYMBOL_GPL(__lockdep_no_validate__); | ||
3001 | 3002 | ||
3002 | static int | 3003 | static int |
3003 | print_lock_nested_lock_not_held(struct task_struct *curr, | 3004 | print_lock_nested_lock_not_held(struct task_struct *curr, |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index b3c6c3fcd847..cfff1435bdfb 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
@@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) | |||
126 | 126 | ||
127 | EXPORT_SYMBOL(_down_write_nest_lock); | 127 | EXPORT_SYMBOL(_down_write_nest_lock); |
128 | 128 | ||
129 | void down_read_non_owner(struct rw_semaphore *sem) | ||
130 | { | ||
131 | might_sleep(); | ||
132 | |||
133 | __down_read(sem); | ||
134 | } | ||
135 | |||
136 | EXPORT_SYMBOL(down_read_non_owner); | ||
137 | |||
129 | void down_write_nested(struct rw_semaphore *sem, int subclass) | 138 | void down_write_nested(struct rw_semaphore *sem, int subclass) |
130 | { | 139 | { |
131 | might_sleep(); | 140 | might_sleep(); |
@@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) | |||
136 | 145 | ||
137 | EXPORT_SYMBOL(down_write_nested); | 146 | EXPORT_SYMBOL(down_write_nested); |
138 | 147 | ||
148 | void up_read_non_owner(struct rw_semaphore *sem) | ||
149 | { | ||
150 | __up_read(sem); | ||
151 | } | ||
152 | |||
153 | EXPORT_SYMBOL(up_read_non_owner); | ||
154 | |||
139 | #endif | 155 | #endif |
140 | 156 | ||
141 | 157 | ||
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ed58a3216a6d..b8b8560bfb95 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -1808,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
1808 | 1808 | ||
1809 | rwbs[i] = '\0'; | 1809 | rwbs[i] = '\0'; |
1810 | } | 1810 | } |
1811 | EXPORT_SYMBOL_GPL(blk_fill_rwbs); | ||
1811 | 1812 | ||
1812 | #endif /* CONFIG_EVENT_TRACING */ | 1813 | #endif /* CONFIG_EVENT_TRACING */ |
1813 | 1814 | ||
diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 8335d39d2ccd..4a83ecd03650 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c | |||
@@ -365,7 +365,13 @@ static int lc_unused_element_available(struct lru_cache *lc) | |||
365 | return 0; | 365 | return 0; |
366 | } | 366 | } |
367 | 367 | ||
368 | static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) | 368 | /* used as internal flags to __lc_get */ |
369 | enum { | ||
370 | LC_GET_MAY_CHANGE = 1, | ||
371 | LC_GET_MAY_USE_UNCOMMITTED = 2, | ||
372 | }; | ||
373 | |||
374 | static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsigned int flags) | ||
369 | { | 375 | { |
370 | struct lc_element *e; | 376 | struct lc_element *e; |
371 | 377 | ||
@@ -380,22 +386,31 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool | |||
380 | * this enr is currently being pulled in already, | 386 | * this enr is currently being pulled in already, |
381 | * and will be available once the pending transaction | 387 | * and will be available once the pending transaction |
382 | * has been committed. */ | 388 | * has been committed. */ |
383 | if (e && e->lc_new_number == e->lc_number) { | 389 | if (e) { |
390 | if (e->lc_new_number != e->lc_number) { | ||
391 | /* It has been found above, but on the "to_be_changed" | ||
392 | * list, not yet committed. Don't pull it in twice, | ||
393 | * wait for the transaction, then try again... | ||
394 | */ | ||
395 | if (!(flags & LC_GET_MAY_USE_UNCOMMITTED)) | ||
396 | RETURN(NULL); | ||
397 | /* ... unless the caller is aware of the implications, | ||
398 | * probably preparing a cumulative transaction. */ | ||
399 | ++e->refcnt; | ||
400 | ++lc->hits; | ||
401 | RETURN(e); | ||
402 | } | ||
403 | /* else: lc_new_number == lc_number; a real hit. */ | ||
384 | ++lc->hits; | 404 | ++lc->hits; |
385 | if (e->refcnt++ == 0) | 405 | if (e->refcnt++ == 0) |
386 | lc->used++; | 406 | lc->used++; |
387 | list_move(&e->list, &lc->in_use); /* Not evictable... */ | 407 | list_move(&e->list, &lc->in_use); /* Not evictable... */ |
388 | RETURN(e); | 408 | RETURN(e); |
389 | } | 409 | } |
410 | /* e == NULL */ | ||
390 | 411 | ||
391 | ++lc->misses; | 412 | ++lc->misses; |
392 | if (!may_change) | 413 | if (!(flags & LC_GET_MAY_CHANGE)) |
393 | RETURN(NULL); | ||
394 | |||
395 | /* It has been found above, but on the "to_be_changed" list, not yet | ||
396 | * committed. Don't pull it in twice, wait for the transaction, then | ||
397 | * try again */ | ||
398 | if (e) | ||
399 | RETURN(NULL); | 414 | RETURN(NULL); |
400 | 415 | ||
401 | /* To avoid races with lc_try_lock(), first, mark us dirty | 416 | /* To avoid races with lc_try_lock(), first, mark us dirty |
@@ -477,7 +492,27 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool | |||
477 | */ | 492 | */ |
478 | struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) | 493 | struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) |
479 | { | 494 | { |
480 | return __lc_get(lc, enr, 1); | 495 | return __lc_get(lc, enr, LC_GET_MAY_CHANGE); |
496 | } | ||
497 | |||
498 | /** | ||
499 | * lc_get_cumulative - like lc_get; also finds to-be-changed elements | ||
500 | * @lc: the lru cache to operate on | ||
501 | * @enr: the label to look up | ||
502 | * | ||
503 | * Unlike lc_get this also returns the element for @enr, if it is belonging to | ||
504 | * a pending transaction, so the return values are like for lc_get(), | ||
505 | * plus: | ||
506 | * | ||
507 | * pointer to an element already on the "to_be_changed" list. | ||
508 | * In this case, the cache was already marked %LC_DIRTY. | ||
509 | * | ||
510 | * Caller needs to make sure that the pending transaction is completed, | ||
511 | * before proceeding to actually use this element. | ||
512 | */ | ||
513 | struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr) | ||
514 | { | ||
515 | return __lc_get(lc, enr, LC_GET_MAY_CHANGE|LC_GET_MAY_USE_UNCOMMITTED); | ||
481 | } | 516 | } |
482 | 517 | ||
483 | /** | 518 | /** |
@@ -648,3 +683,4 @@ EXPORT_SYMBOL(lc_seq_printf_stats); | |||
648 | EXPORT_SYMBOL(lc_seq_dump_details); | 683 | EXPORT_SYMBOL(lc_seq_dump_details); |
649 | EXPORT_SYMBOL(lc_try_lock); | 684 | EXPORT_SYMBOL(lc_try_lock); |
650 | EXPORT_SYMBOL(lc_is_used); | 685 | EXPORT_SYMBOL(lc_is_used); |
686 | EXPORT_SYMBOL(lc_get_cumulative); | ||