diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-13 22:15:14 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-13 22:15:14 -0500 |
commit | d080827f850ba4df5b955d5ca8c8c0fc92fe18c0 (patch) | |
tree | 37262315200bbbe50bdd64ce3011951a92855159 /block | |
parent | cbd88cd4c07f9361914ab7fd7e21c9227986fe68 (diff) | |
parent | 8b63b6bfc1a551acf154061699028c7032d7890c (diff) |
Merge tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm updates from Dan Williams:
"The bulk of this has appeared in -next and independently received a
build success notification from the kbuild robot. The 'for-4.5/block-
dax' topic branch was rebased over the weekend to drop the "block
device end-of-life" rework that Al would like to see re-implemented
with a notifier, and to address bug reports against the badblocks
integration.
There is pending feedback against "libnvdimm: Add a poison list and
export badblocks" received last week. Linda identified some localized
fixups that we will handle incrementally.
Summary:
- Media error handling: The 'badblocks' implementation that
originated in md-raid is up-levelled to a generic capability of a
block device. This initial implementation is limited to being
consulted in the pmem block-i/o path. Later, 'badblocks' will be
consulted when creating dax mappings.
- Raw block device dax: For virtualization and other cases that want
large contiguous mappings of persistent memory, add the capability
to dax-mmap a block device directly.
- Increased /dev/mem restrictions: Add an option to treat all
io-memory as IORESOURCE_EXCLUSIVE, i.e. disable /dev/mem access
while a driver is actively using an address range. This behavior
is controlled via the new CONFIG_IO_STRICT_DEVMEM option and can be
overridden by the existing "iomem=relaxed" kernel command line
option.
- Miscellaneous fixes include a 'pfn'-device huge page alignment fix,
block device shutdown crash fix, and other small libnvdimm fixes"
* tag 'libnvdimm-for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (32 commits)
block: kill disk_{check|set|clear|alloc}_badblocks
libnvdimm, pmem: nvdimm_read_bytes() badblocks support
pmem, dax: disable dax in the presence of bad blocks
pmem: fail io-requests to known bad blocks
libnvdimm: convert to statically allocated badblocks
libnvdimm: don't fail init for full badblocks list
block, badblocks: introduce devm_init_badblocks
block: clarify badblocks lifetime
badblocks: rename badblocks_free to badblocks_exit
libnvdimm, pmem: move definition of nvdimm_namespace_add_poison to nd.h
libnvdimm: Add a poison list and export badblocks
nfit_test: Enable DSMs for all test NFITs
md: convert to use the generic badblocks code
block: Add badblock management for gendisks
badblocks: Add core badblock management code
block: fix del_gendisk() vs blkdev_ioctl crash
block: enable dax for raw block devices
block: introduce bdev_file_inode()
restrict /dev/mem to idle io memory ranges
arch: consolidate CONFIG_STRICT_DEVM in lib/Kconfig.debug
...
Diffstat (limited to 'block')
-rw-r--r-- | block/Makefile | 2 | ||||
-rw-r--r-- | block/badblocks.c | 585 | ||||
-rw-r--r-- | block/genhd.c | 30 | ||||
-rw-r--r-- | block/ioctl.c | 71 |
4 files changed, 686 insertions, 2 deletions
diff --git a/block/Makefile b/block/Makefile index 00ecc97629db..db5f622c9d67 100644 --- a/block/Makefile +++ b/block/Makefile | |||
@@ -8,7 +8,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ | |||
8 | blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ | 8 | blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ |
9 | blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ | 9 | blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ |
10 | genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ | 10 | genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ |
11 | partitions/ | 11 | badblocks.o partitions/ |
12 | 12 | ||
13 | obj-$(CONFIG_BOUNCE) += bounce.o | 13 | obj-$(CONFIG_BOUNCE) += bounce.o |
14 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o | 14 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o |
diff --git a/block/badblocks.c b/block/badblocks.c new file mode 100644 index 000000000000..7be53cb1cc3c --- /dev/null +++ b/block/badblocks.c | |||
@@ -0,0 +1,585 @@ | |||
1 | /* | ||
2 | * Bad block management | ||
3 | * | ||
4 | * - Heavily based on MD badblocks code from Neil Brown | ||
5 | * | ||
6 | * Copyright (c) 2015, Intel Corporation. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify it | ||
9 | * under the terms and conditions of the GNU General Public License, | ||
10 | * version 2, as published by the Free Software Foundation. | ||
11 | * | ||
12 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
15 | * more details. | ||
16 | */ | ||
17 | |||
18 | #include <linux/badblocks.h> | ||
19 | #include <linux/seqlock.h> | ||
20 | #include <linux/device.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/stddef.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/slab.h> | ||
26 | |||
/**
 * badblocks_check() - check a given range for bad sectors
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		sector (start) at which to check for badblocks
 * @sectors:	number of sectors to check for badblocks
 * @first_bad:	pointer to store location of the first badblock
 * @bad_sectors: pointer to store number of badblocks after @first_bad
 *
 * We can record which blocks on each device are 'bad' and so just
 * fail those blocks, or that stripe, rather than the whole device.
 * Entries in the bad-block table are 64bits wide.  This comprises:
 * Length of bad-range, in sectors: 0-511 for lengths 1-512
 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
 * A 'shift' can be set so that larger blocks are tracked and
 * consequently larger devices can be covered.
 * 'Acknowledged' flag - 1 bit. - the most significant bit.
 *
 * Locking of the bad-block table uses a seqlock so badblocks_check
 * might need to retry if it is very unlucky.
 * We will sometimes want to check for bad blocks in a bi_end_io function,
 * so we use the write_seqlock_irq variant.
 *
 * When looking for a bad block we specify a range and want to
 * know if any block in the range is bad.  So we binary-search
 * to the last range that starts at-or-before the given endpoint,
 * (or "before the sector after the target range")
 * then see if it ends after the given start.
 *
 * Return:
 *  0: there are no known bad blocks in the range
 *  1: there are known bad blocks which are all acknowledged
 * -1: there are bad blocks which have not yet been acknowledged in metadata.
 * plus the start/length of the first bad section we overlap.
 */
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
			sector_t *first_bad, int *bad_sectors)
{
	int hi;
	int lo;
	u64 *p = bb->page;
	int rv;
	sector_t target = s + sectors;
	unsigned seq;

	if (bb->shift > 0) {
		/* round the start down, and the end up */
		s >>= bb->shift;
		target += (1<<bb->shift) - 1;
		target >>= bb->shift;
		sectors = target - s;
	}
	/* 'target' is now the first block after the bad range */

retry:
	seq = read_seqbegin(&bb->lock);
	lo = 0;
	rv = 0;
	hi = bb->count;

	/* Binary search between lo and hi for 'target'
	 * i.e. for the last range that starts before 'target'
	 */
	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
	 * are known not to be the last range before target.
	 * VARIANT: hi-lo is the number of possible
	 * ranges, and decreases until it reaches 1
	 */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a < target)
			/* This could still be the one, earlier ranges
			 * could not.
			 */
			lo = mid;
		else
			/* This and later ranges are definitely out. */
			hi = mid;
	}
	/* 'lo' might be the last that started before target, but 'hi' isn't */
	if (hi > lo) {
		/* need to check all range that end after 's' to see if
		 * any are unacknowledged.
		 */
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			if (BB_OFFSET(p[lo]) < target) {
				/* starts before the end, and finishes after
				 * the start, so they must overlap
				 */
				if (rv != -1 && BB_ACK(p[lo]))
					rv = 1;
				else
					rv = -1;
				*first_bad = BB_OFFSET(p[lo]);
				*bad_sectors = BB_LEN(p[lo]);
			}
			lo--;
		}
	}

	/* A writer raced with us; the table may have changed under our
	 * feet, so redo the whole lookup.
	 */
	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_check);
135 | |||
/**
 * badblocks_set() - Add a range of bad blocks to the table.
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		first sector to mark as bad
 * @sectors:	number of sectors to mark as bad
 * @acknowledged: whether to mark the bad sectors as acknowledged
 *
 * This might extend the table, or might contract it if two adjacent ranges
 * can be merged. We binary-search to find the 'insertion' point, then
 * decide how best to handle it.
 *
 * Return:
 *  0: success
 *  1: failed to set badblocks (out of space)
 */
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
			int acknowledged)
{
	u64 *p;
	int lo, hi;
	int rv = 0;
	unsigned long flags;

	if (bb->shift < 0)
		/* badblocks are disabled */
		return 0;

	if (bb->shift) {
		/* round the start down, and the end up */
		sector_t next = s + sectors;

		s >>= bb->shift;
		next += (1<<bb->shift) - 1;
		next >>= bb->shift;
		sectors = next - s;
	}

	write_seqlock_irqsave(&bb->lock, flags);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts at-or-before 's' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a <= s)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo && BB_OFFSET(p[lo]) > s)
		hi = lo;

	if (hi > lo) {
		/* we found a range that might merge with the start
		 * of our new range
		 */
		sector_t a = BB_OFFSET(p[lo]);
		sector_t e = a + BB_LEN(p[lo]);
		int ack = BB_ACK(p[lo]);

		if (e >= s) {
			/* Yes, we can merge with a previous range */
			if (s == a && s + sectors >= e)
				/* new range covers old */
				ack = acknowledged;
			else
				ack = ack && acknowledged;

			if (e < s + sectors)
				e = s + sectors;
			if (e - a <= BB_MAX_LEN) {
				p[lo] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				/* does not all fit in one range,
				 * make p[lo] maximal
				 */
				if (BB_LEN(p[lo]) != BB_MAX_LEN)
					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
		}
	}
	if (sectors && hi < bb->count) {
		/* 'hi' points to the first range that starts after 's'.
		 * Maybe we can merge with the start of that range
		 */
		sector_t a = BB_OFFSET(p[hi]);
		sector_t e = a + BB_LEN(p[hi]);
		int ack = BB_ACK(p[hi]);

		if (a <= s + sectors) {
			/* merging is possible */
			if (e <= s + sectors) {
				/* full overlap */
				e = s + sectors;
				ack = acknowledged;
			} else
				ack = ack && acknowledged;

			a = s;
			if (e - a <= BB_MAX_LEN) {
				p[hi] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
			lo = hi;
			hi++;
		}
	}
	if (sectors == 0 && hi < bb->count) {
		/* we might be able to combine lo and hi */
		/* Note: 's' is at the end of 'lo' */
		sector_t a = BB_OFFSET(p[hi]);
		int lolen = BB_LEN(p[lo]);
		int hilen = BB_LEN(p[hi]);
		int newlen = lolen + hilen - (s - a);

		if (s >= a && newlen < BB_MAX_LEN) {
			/* yes, we can combine them */
			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);

			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
			memmove(p + hi, p + hi + 1,
				(bb->count - hi - 1) * 8);
			bb->count--;
		}
	}
	while (sectors) {
		/* didn't merge (it all).
		 * Need to add a range just before 'hi'
		 */
		if (bb->count >= MAX_BADBLOCKS) {
			/* No room for more */
			rv = 1;
			break;
		} else {
			int this_sectors = sectors;

			memmove(p + hi + 1, p + hi,
				(bb->count - hi) * 8);
			bb->count++;

			if (this_sectors > BB_MAX_LEN)
				this_sectors = BB_MAX_LEN;
			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
			sectors -= this_sectors;
			s += this_sectors;
		}
	}

	bb->changed = 1;
	if (!acknowledged)
		bb->unacked_exist = 1;
	write_sequnlock_irqrestore(&bb->lock, flags);

	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_set);
302 | |||
/**
 * badblocks_clear() - Remove a range of bad blocks from the table.
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		first sector to clear
 * @sectors:	number of sectors to clear
 *
 * This may involve extending the table if we split a region,
 * but it must not fail.  So if the table becomes full, we just
 * drop the remove request.
 *
 * Return:
 *  0: success
 *  -ENOSPC: no room to split an existing range, request dropped
 */
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
	u64 *p;
	int lo, hi;
	sector_t target = s + sectors;
	int rv = 0;

	if (bb->shift > 0) {
		/* When clearing we round the start up and the end down.
		 * This should not matter as the shift should align with
		 * the block size and no rounding should ever be needed.
		 * However it is better to think a block is bad when it
		 * isn't than to think a block is not bad when it is.
		 */
		s += (1<<bb->shift) - 1;
		s >>= bb->shift;
		target >>= bb->shift;
		sectors = target - s;
	}

	write_seqlock_irq(&bb->lock);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts before 'target' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a < target)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo) {
		/* p[lo] is the last range that could overlap the
		 * current range.  Earlier ranges could also overlap,
		 * but only this one can overlap the end of the range.
		 */
		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
			/* Partial overlap, leave the tail of this range */
			int ack = BB_ACK(p[lo]);
			sector_t a = BB_OFFSET(p[lo]);
			sector_t end = a + BB_LEN(p[lo]);

			if (a < s) {
				/* we need to split this range */
				if (bb->count >= MAX_BADBLOCKS) {
					rv = -ENOSPC;
					goto out;
				}
				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
				bb->count++;
				p[lo] = BB_MAKE(a, s-a, ack);
				lo++;
			}
			p[lo] = BB_MAKE(target, end - target, ack);
			/* there is no longer an overlap */
			hi = lo;
			lo--;
		}
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			/* This range does overlap */
			if (BB_OFFSET(p[lo]) < s) {
				/* Keep the early parts of this range. */
				int ack = BB_ACK(p[lo]);
				sector_t start = BB_OFFSET(p[lo]);

				p[lo] = BB_MAKE(start, s - start, ack);
				/* now low doesn't overlap, so.. */
				break;
			}
			lo--;
		}
		/* 'lo' is strictly before, 'hi' is strictly after,
		 * anything between needs to be discarded
		 */
		if (hi - lo > 1) {
			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
			bb->count -= (hi - lo - 1);
		}
	}

	bb->changed = 1;
out:
	write_sequnlock_irq(&bb->lock);
	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_clear);
408 | |||
409 | /** | ||
410 | * ack_all_badblocks() - Acknowledge all bad blocks in a list. | ||
411 | * @bb: the badblocks structure that holds all badblock information | ||
412 | * | ||
413 | * This only succeeds if ->changed is clear. It is used by | ||
414 | * in-kernel metadata updates | ||
415 | */ | ||
416 | void ack_all_badblocks(struct badblocks *bb) | ||
417 | { | ||
418 | if (bb->page == NULL || bb->changed) | ||
419 | /* no point even trying */ | ||
420 | return; | ||
421 | write_seqlock_irq(&bb->lock); | ||
422 | |||
423 | if (bb->changed == 0 && bb->unacked_exist) { | ||
424 | u64 *p = bb->page; | ||
425 | int i; | ||
426 | |||
427 | for (i = 0; i < bb->count ; i++) { | ||
428 | if (!BB_ACK(p[i])) { | ||
429 | sector_t start = BB_OFFSET(p[i]); | ||
430 | int len = BB_LEN(p[i]); | ||
431 | |||
432 | p[i] = BB_MAKE(start, len, 1); | ||
433 | } | ||
434 | } | ||
435 | bb->unacked_exist = 0; | ||
436 | } | ||
437 | write_sequnlock_irq(&bb->lock); | ||
438 | } | ||
439 | EXPORT_SYMBOL_GPL(ack_all_badblocks); | ||
440 | |||
/**
 * badblocks_show() - sysfs access to bad-blocks list
 * @bb:		the badblocks structure that holds all badblock information
 * @page:	buffer received from sysfs
 * @unack:	whether to show unacknowledged badblocks
 *
 * Return:
 *  Length of returned data
 */
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
{
	size_t len;
	int i;
	u64 *p = bb->page;
	unsigned seq;

	if (bb->shift < 0)
		return 0;

retry:
	seq = read_seqbegin(&bb->lock);

	len = 0;
	i = 0;

	while (len < PAGE_SIZE && i < bb->count) {
		sector_t s = BB_OFFSET(p[i]);
		unsigned int length = BB_LEN(p[i]);
		int ack = BB_ACK(p[i]);

		i++;

		/* when listing only unacknowledged blocks, skip acked ones */
		if (unack && ack)
			continue;

		/* offsets/lengths are stored shifted; print real sectors */
		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
				(unsigned long long)s << bb->shift,
				length << bb->shift);
	}
	/* NOTE(review): this writes to @bb under only a read seqbegin;
	 * presumably safe because the flag is advisory — confirm.
	 */
	if (unack && len == 0)
		bb->unacked_exist = 0;

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return len;
}
EXPORT_SYMBOL_GPL(badblocks_show);
489 | |||
490 | /** | ||
491 | * badblocks_store() - sysfs access to bad-blocks list | ||
492 | * @bb: the badblocks structure that holds all badblock information | ||
493 | * @page: buffer received from sysfs | ||
494 | * @len: length of data received from sysfs | ||
495 | * @unack: weather to show unacknowledged badblocks | ||
496 | * | ||
497 | * Return: | ||
498 | * Length of the buffer processed or -ve error. | ||
499 | */ | ||
500 | ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, | ||
501 | int unack) | ||
502 | { | ||
503 | unsigned long long sector; | ||
504 | int length; | ||
505 | char newline; | ||
506 | |||
507 | switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { | ||
508 | case 3: | ||
509 | if (newline != '\n') | ||
510 | return -EINVAL; | ||
511 | case 2: | ||
512 | if (length <= 0) | ||
513 | return -EINVAL; | ||
514 | break; | ||
515 | default: | ||
516 | return -EINVAL; | ||
517 | } | ||
518 | |||
519 | if (badblocks_set(bb, sector, length, !unack)) | ||
520 | return -ENOSPC; | ||
521 | else | ||
522 | return len; | ||
523 | } | ||
524 | EXPORT_SYMBOL_GPL(badblocks_store); | ||
525 | |||
526 | static int __badblocks_init(struct device *dev, struct badblocks *bb, | ||
527 | int enable) | ||
528 | { | ||
529 | bb->dev = dev; | ||
530 | bb->count = 0; | ||
531 | if (enable) | ||
532 | bb->shift = 0; | ||
533 | else | ||
534 | bb->shift = -1; | ||
535 | if (dev) | ||
536 | bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); | ||
537 | else | ||
538 | bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); | ||
539 | if (!bb->page) { | ||
540 | bb->shift = -1; | ||
541 | return -ENOMEM; | ||
542 | } | ||
543 | seqlock_init(&bb->lock); | ||
544 | |||
545 | return 0; | ||
546 | } | ||
547 | |||
/**
 * badblocks_init() - initialize the badblocks structure
 * @bb:		the badblocks structure that holds all badblock information
 * @enable:	whether to enable badblocks accounting
 *
 * The table page is allocated with kzalloc(); pair with badblocks_exit().
 *
 * Return:
 *  0: success
 *  -ve errno: on error
 */
int badblocks_init(struct badblocks *bb, int enable)
{
	return __badblocks_init(NULL, bb, enable);
}
EXPORT_SYMBOL_GPL(badblocks_init);
562 | |||
/**
 * devm_init_badblocks() - device-managed initialization of a badblocks
 *			   structure
 * @dev:	device that owns the badblocks table page
 * @bb:		the badblocks structure that holds all badblock information
 *
 * Accounting is always enabled.  The table page is devm-allocated, so it
 * is released automatically when @dev goes away (or via badblocks_exit()).
 *
 * Return:
 *  0: success
 *  -EINVAL: @bb is NULL
 *  -ve errno: on allocation error
 */
int devm_init_badblocks(struct device *dev, struct badblocks *bb)
{
	if (!bb)
		return -EINVAL;
	return __badblocks_init(dev, bb, 1);
}
EXPORT_SYMBOL_GPL(devm_init_badblocks);
570 | |||
571 | /** | ||
572 | * badblocks_exit() - free the badblocks structure | ||
573 | * @bb: the badblocks structure that holds all badblock information | ||
574 | */ | ||
575 | void badblocks_exit(struct badblocks *bb) | ||
576 | { | ||
577 | if (!bb) | ||
578 | return; | ||
579 | if (bb->dev) | ||
580 | devm_kfree(bb->dev, bb->page); | ||
581 | else | ||
582 | kfree(bb->page); | ||
583 | bb->page = NULL; | ||
584 | } | ||
585 | EXPORT_SYMBOL_GPL(badblocks_exit); | ||
diff --git a/block/genhd.c b/block/genhd.c index e5cafa51567c..5aaeb2ad0fd3 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
21 | #include <linux/log2.h> | 21 | #include <linux/log2.h> |
22 | #include <linux/pm_runtime.h> | 22 | #include <linux/pm_runtime.h> |
23 | #include <linux/badblocks.h> | ||
23 | 24 | ||
24 | #include "blk.h" | 25 | #include "blk.h" |
25 | 26 | ||
@@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk) | |||
664 | 665 | ||
665 | kobject_put(disk->part0.holder_dir); | 666 | kobject_put(disk->part0.holder_dir); |
666 | kobject_put(disk->slave_dir); | 667 | kobject_put(disk->slave_dir); |
667 | disk->driverfs_dev = NULL; | ||
668 | if (!sysfs_deprecated) | 668 | if (!sysfs_deprecated) |
669 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); | 669 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); |
670 | pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); | 670 | pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); |
@@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk) | |||
672 | } | 672 | } |
673 | EXPORT_SYMBOL(del_gendisk); | 673 | EXPORT_SYMBOL(del_gendisk); |
674 | 674 | ||
675 | /* sysfs access to bad-blocks list. */ | ||
676 | static ssize_t disk_badblocks_show(struct device *dev, | ||
677 | struct device_attribute *attr, | ||
678 | char *page) | ||
679 | { | ||
680 | struct gendisk *disk = dev_to_disk(dev); | ||
681 | |||
682 | if (!disk->bb) | ||
683 | return sprintf(page, "\n"); | ||
684 | |||
685 | return badblocks_show(disk->bb, page, 0); | ||
686 | } | ||
687 | |||
688 | static ssize_t disk_badblocks_store(struct device *dev, | ||
689 | struct device_attribute *attr, | ||
690 | const char *page, size_t len) | ||
691 | { | ||
692 | struct gendisk *disk = dev_to_disk(dev); | ||
693 | |||
694 | if (!disk->bb) | ||
695 | return -ENXIO; | ||
696 | |||
697 | return badblocks_store(disk->bb, page, len, 0); | ||
698 | } | ||
699 | |||
675 | /** | 700 | /** |
676 | * get_gendisk - get partitioning information for a given device | 701 | * get_gendisk - get partitioning information for a given device |
677 | * @devt: device to get partitioning information for | 702 | * @devt: device to get partitioning information for |
@@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, | |||
990 | static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); | 1015 | static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); |
991 | static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); | 1016 | static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); |
992 | static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); | 1017 | static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); |
1018 | static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show, | ||
1019 | disk_badblocks_store); | ||
993 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 1020 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
994 | static struct device_attribute dev_attr_fail = | 1021 | static struct device_attribute dev_attr_fail = |
995 | __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); | 1022 | __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); |
@@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = { | |||
1011 | &dev_attr_capability.attr, | 1038 | &dev_attr_capability.attr, |
1012 | &dev_attr_stat.attr, | 1039 | &dev_attr_stat.attr, |
1013 | &dev_attr_inflight.attr, | 1040 | &dev_attr_inflight.attr, |
1041 | &dev_attr_badblocks.attr, | ||
1014 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 1042 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
1015 | &dev_attr_fail.attr, | 1043 | &dev_attr_fail.attr, |
1016 | #endif | 1044 | #endif |
diff --git a/block/ioctl.c b/block/ioctl.c index 0918aed2d847..2c84683aada5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/gfp.h> | 4 | #include <linux/gfp.h> |
5 | #include <linux/blkpg.h> | 5 | #include <linux/blkpg.h> |
6 | #include <linux/hdreg.h> | 6 | #include <linux/hdreg.h> |
7 | #include <linux/badblocks.h> | ||
7 | #include <linux/backing-dev.h> | 8 | #include <linux/backing-dev.h> |
8 | #include <linux/fs.h> | 9 | #include <linux/fs.h> |
9 | #include <linux/blktrace_api.h> | 10 | #include <linux/blktrace_api.h> |
@@ -406,6 +407,71 @@ static inline int is_unrecognized_ioctl(int ret) | |||
406 | ret == -ENOIOCTLCMD; | 407 | ret == -ENOIOCTLCMD; |
407 | } | 408 | } |
408 | 409 | ||
410 | #ifdef CONFIG_FS_DAX | ||
411 | bool blkdev_dax_capable(struct block_device *bdev) | ||
412 | { | ||
413 | struct gendisk *disk = bdev->bd_disk; | ||
414 | |||
415 | if (!disk->fops->direct_access) | ||
416 | return false; | ||
417 | |||
418 | /* | ||
419 | * If the partition is not aligned on a page boundary, we can't | ||
420 | * do dax I/O to it. | ||
421 | */ | ||
422 | if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) | ||
423 | || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) | ||
424 | return false; | ||
425 | |||
426 | /* | ||
427 | * If the device has known bad blocks, force all I/O through the | ||
428 | * driver / page cache. | ||
429 | * | ||
430 | * TODO: support finer grained dax error handling | ||
431 | */ | ||
432 | if (disk->bb && disk->bb->count) | ||
433 | return false; | ||
434 | |||
435 | return true; | ||
436 | } | ||
437 | |||
438 | static int blkdev_daxset(struct block_device *bdev, unsigned long argp) | ||
439 | { | ||
440 | unsigned long arg; | ||
441 | int rc = 0; | ||
442 | |||
443 | if (!capable(CAP_SYS_ADMIN)) | ||
444 | return -EACCES; | ||
445 | |||
446 | if (get_user(arg, (int __user *)(argp))) | ||
447 | return -EFAULT; | ||
448 | arg = !!arg; | ||
449 | if (arg == !!(bdev->bd_inode->i_flags & S_DAX)) | ||
450 | return 0; | ||
451 | |||
452 | if (arg) | ||
453 | arg = S_DAX; | ||
454 | |||
455 | if (arg && !blkdev_dax_capable(bdev)) | ||
456 | return -ENOTTY; | ||
457 | |||
458 | mutex_lock(&bdev->bd_inode->i_mutex); | ||
459 | if (bdev->bd_map_count == 0) | ||
460 | inode_set_flags(bdev->bd_inode, arg, S_DAX); | ||
461 | else | ||
462 | rc = -EBUSY; | ||
463 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
464 | return rc; | ||
465 | } | ||
466 | #else | ||
467 | static int blkdev_daxset(struct block_device *bdev, int arg) | ||
468 | { | ||
469 | if (arg) | ||
470 | return -ENOTTY; | ||
471 | return 0; | ||
472 | } | ||
473 | #endif | ||
474 | |||
409 | static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, | 475 | static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, |
410 | unsigned cmd, unsigned long arg) | 476 | unsigned cmd, unsigned long arg) |
411 | { | 477 | { |
@@ -568,6 +634,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
568 | case BLKTRACESETUP: | 634 | case BLKTRACESETUP: |
569 | case BLKTRACETEARDOWN: | 635 | case BLKTRACETEARDOWN: |
570 | return blk_trace_ioctl(bdev, cmd, argp); | 636 | return blk_trace_ioctl(bdev, cmd, argp); |
637 | case BLKDAXSET: | ||
638 | return blkdev_daxset(bdev, arg); | ||
639 | case BLKDAXGET: | ||
640 | return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); | ||
641 | break; | ||
571 | case IOC_PR_REGISTER: | 642 | case IOC_PR_REGISTER: |
572 | return blkdev_pr_register(bdev, argp); | 643 | return blkdev_pr_register(bdev, argp); |
573 | case IOC_PR_RESERVE: | 644 | case IOC_PR_RESERVE: |