diff options
Diffstat (limited to 'drivers/block/brd.c')
-rw-r--r-- | drivers/block/brd.c | 548 |
1 files changed, 548 insertions, 0 deletions
diff --git a/drivers/block/brd.c b/drivers/block/brd.c new file mode 100644 index 000000000000..50b659bedc8f --- /dev/null +++ b/drivers/block/brd.c | |||
@@ -0,0 +1,548 @@ | |||
1 | /* | ||
2 | * Ram backed block device driver. | ||
3 | * | ||
4 | * Copyright (C) 2007 Nick Piggin | ||
5 | * Copyright (C) 2007 Novell Inc. | ||
6 | * | ||
7 | * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright | ||
8 | * of their respective owners. | ||
9 | */ | ||
10 | |||
11 | #include <linux/init.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/moduleparam.h> | ||
14 | #include <linux/major.h> | ||
15 | #include <linux/blkdev.h> | ||
16 | #include <linux/bio.h> | ||
17 | #include <linux/highmem.h> | ||
18 | #include <linux/gfp.h> | ||
19 | #include <linux/radix-tree.h> | ||
20 | #include <linux/buffer_head.h> /* invalidate_bh_lrus() */ | ||
21 | |||
22 | #include <asm/uaccess.h> | ||
23 | |||
/* log2 of the sector size: all sector <-> byte conversions shift by this. */
#define SECTOR_SHIFT		9
/* Shift that converts a sector number into a page index (and back). */
#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
/* Number of sectors that fit in one page. */
#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
27 | |||
28 | /* | ||
29 | * Each block ramdisk device has a radix_tree brd_pages of pages that stores | ||
30 | * the pages containing the block device's contents. A brd page's ->index is | ||
31 | * its offset in PAGE_SIZE units. This is similar to, but in no way connected | ||
32 | * with, the kernel's pagecache or buffer cache (which sit above our block | ||
33 | * device). | ||
34 | */ | ||
35 | struct brd_device { | ||
36 | int brd_number; | ||
37 | int brd_refcnt; | ||
38 | loff_t brd_offset; | ||
39 | loff_t brd_sizelimit; | ||
40 | unsigned brd_blocksize; | ||
41 | |||
42 | struct request_queue *brd_queue; | ||
43 | struct gendisk *brd_disk; | ||
44 | struct list_head brd_list; | ||
45 | |||
46 | /* | ||
47 | * Backing store of pages and lock to protect it. This is the contents | ||
48 | * of the block device. | ||
49 | */ | ||
50 | spinlock_t brd_lock; | ||
51 | struct radix_tree_root brd_pages; | ||
52 | }; | ||
53 | |||
54 | /* | ||
55 | * Look up and return a brd's page for a given sector. | ||
56 | */ | ||
57 | static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) | ||
58 | { | ||
59 | pgoff_t idx; | ||
60 | struct page *page; | ||
61 | |||
62 | /* | ||
63 | * The page lifetime is protected by the fact that we have opened the | ||
64 | * device node -- brd pages will never be deleted under us, so we | ||
65 | * don't need any further locking or refcounting. | ||
66 | * | ||
67 | * This is strictly true for the radix-tree nodes as well (ie. we | ||
68 | * don't actually need the rcu_read_lock()), however that is not a | ||
69 | * documented feature of the radix-tree API so it is better to be | ||
70 | * safe here (we don't have total exclusion from radix tree updates | ||
71 | * here, only deletes). | ||
72 | */ | ||
73 | rcu_read_lock(); | ||
74 | idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ | ||
75 | page = radix_tree_lookup(&brd->brd_pages, idx); | ||
76 | rcu_read_unlock(); | ||
77 | |||
78 | BUG_ON(page && page->index != idx); | ||
79 | |||
80 | return page; | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Look up and return a brd's page for a given sector. | ||
85 | * If one does not exist, allocate an empty page, and insert that. Then | ||
86 | * return it. | ||
87 | */ | ||
88 | static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) | ||
89 | { | ||
90 | pgoff_t idx; | ||
91 | struct page *page; | ||
92 | |||
93 | page = brd_lookup_page(brd, sector); | ||
94 | if (page) | ||
95 | return page; | ||
96 | |||
97 | /* | ||
98 | * Must use NOIO because we don't want to recurse back into the | ||
99 | * block or filesystem layers from page reclaim. | ||
100 | */ | ||
101 | page = alloc_page(GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO); | ||
102 | if (!page) | ||
103 | return NULL; | ||
104 | |||
105 | if (radix_tree_preload(GFP_NOIO)) { | ||
106 | __free_page(page); | ||
107 | return NULL; | ||
108 | } | ||
109 | |||
110 | spin_lock(&brd->brd_lock); | ||
111 | idx = sector >> PAGE_SECTORS_SHIFT; | ||
112 | if (radix_tree_insert(&brd->brd_pages, idx, page)) { | ||
113 | __free_page(page); | ||
114 | page = radix_tree_lookup(&brd->brd_pages, idx); | ||
115 | BUG_ON(!page); | ||
116 | BUG_ON(page->index != idx); | ||
117 | } else | ||
118 | page->index = idx; | ||
119 | spin_unlock(&brd->brd_lock); | ||
120 | |||
121 | radix_tree_preload_end(); | ||
122 | |||
123 | return page; | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Free all backing store pages and radix tree. This must only be called when | ||
128 | * there are no other users of the device. | ||
129 | */ | ||
130 | #define FREE_BATCH 16 | ||
131 | static void brd_free_pages(struct brd_device *brd) | ||
132 | { | ||
133 | unsigned long pos = 0; | ||
134 | struct page *pages[FREE_BATCH]; | ||
135 | int nr_pages; | ||
136 | |||
137 | do { | ||
138 | int i; | ||
139 | |||
140 | nr_pages = radix_tree_gang_lookup(&brd->brd_pages, | ||
141 | (void **)pages, pos, FREE_BATCH); | ||
142 | |||
143 | for (i = 0; i < nr_pages; i++) { | ||
144 | void *ret; | ||
145 | |||
146 | BUG_ON(pages[i]->index < pos); | ||
147 | pos = pages[i]->index; | ||
148 | ret = radix_tree_delete(&brd->brd_pages, pos); | ||
149 | BUG_ON(!ret || ret != pages[i]); | ||
150 | __free_page(pages[i]); | ||
151 | } | ||
152 | |||
153 | pos++; | ||
154 | |||
155 | /* | ||
156 | * This assumes radix_tree_gang_lookup always returns as | ||
157 | * many pages as possible. If the radix-tree code changes, | ||
158 | * so will this have to. | ||
159 | */ | ||
160 | } while (nr_pages == FREE_BATCH); | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * copy_to_brd_setup must be called before copy_to_brd. It may sleep. | ||
165 | */ | ||
166 | static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n) | ||
167 | { | ||
168 | unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; | ||
169 | size_t copy; | ||
170 | |||
171 | copy = min_t(size_t, n, PAGE_SIZE - offset); | ||
172 | if (!brd_insert_page(brd, sector)) | ||
173 | return -ENOMEM; | ||
174 | if (copy < n) { | ||
175 | sector += copy >> SECTOR_SHIFT; | ||
176 | if (!brd_insert_page(brd, sector)) | ||
177 | return -ENOMEM; | ||
178 | } | ||
179 | return 0; | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * Copy n bytes from src to the brd starting at sector. Does not sleep. | ||
184 | */ | ||
185 | static void copy_to_brd(struct brd_device *brd, const void *src, | ||
186 | sector_t sector, size_t n) | ||
187 | { | ||
188 | struct page *page; | ||
189 | void *dst; | ||
190 | unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; | ||
191 | size_t copy; | ||
192 | |||
193 | copy = min_t(size_t, n, PAGE_SIZE - offset); | ||
194 | page = brd_lookup_page(brd, sector); | ||
195 | BUG_ON(!page); | ||
196 | |||
197 | dst = kmap_atomic(page, KM_USER1); | ||
198 | memcpy(dst + offset, src, copy); | ||
199 | kunmap_atomic(dst, KM_USER1); | ||
200 | |||
201 | if (copy < n) { | ||
202 | src += copy; | ||
203 | sector += copy >> SECTOR_SHIFT; | ||
204 | copy = n - copy; | ||
205 | page = brd_lookup_page(brd, sector); | ||
206 | BUG_ON(!page); | ||
207 | |||
208 | dst = kmap_atomic(page, KM_USER1); | ||
209 | memcpy(dst, src, copy); | ||
210 | kunmap_atomic(dst, KM_USER1); | ||
211 | } | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * Copy n bytes to dst from the brd starting at sector. Does not sleep. | ||
216 | */ | ||
217 | static void copy_from_brd(void *dst, struct brd_device *brd, | ||
218 | sector_t sector, size_t n) | ||
219 | { | ||
220 | struct page *page; | ||
221 | void *src; | ||
222 | unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; | ||
223 | size_t copy; | ||
224 | |||
225 | copy = min_t(size_t, n, PAGE_SIZE - offset); | ||
226 | page = brd_lookup_page(brd, sector); | ||
227 | if (page) { | ||
228 | src = kmap_atomic(page, KM_USER1); | ||
229 | memcpy(dst, src + offset, copy); | ||
230 | kunmap_atomic(src, KM_USER1); | ||
231 | } else | ||
232 | memset(dst, 0, copy); | ||
233 | |||
234 | if (copy < n) { | ||
235 | dst += copy; | ||
236 | sector += copy >> SECTOR_SHIFT; | ||
237 | copy = n - copy; | ||
238 | page = brd_lookup_page(brd, sector); | ||
239 | if (page) { | ||
240 | src = kmap_atomic(page, KM_USER1); | ||
241 | memcpy(dst, src, copy); | ||
242 | kunmap_atomic(src, KM_USER1); | ||
243 | } else | ||
244 | memset(dst, 0, copy); | ||
245 | } | ||
246 | } | ||
247 | |||
248 | /* | ||
249 | * Process a single bvec of a bio. | ||
250 | */ | ||
251 | static int brd_do_bvec(struct brd_device *brd, struct page *page, | ||
252 | unsigned int len, unsigned int off, int rw, | ||
253 | sector_t sector) | ||
254 | { | ||
255 | void *mem; | ||
256 | int err = 0; | ||
257 | |||
258 | if (rw != READ) { | ||
259 | err = copy_to_brd_setup(brd, sector, len); | ||
260 | if (err) | ||
261 | goto out; | ||
262 | } | ||
263 | |||
264 | mem = kmap_atomic(page, KM_USER0); | ||
265 | if (rw == READ) { | ||
266 | copy_from_brd(mem + off, brd, sector, len); | ||
267 | flush_dcache_page(page); | ||
268 | } else | ||
269 | copy_to_brd(brd, mem + off, sector, len); | ||
270 | kunmap_atomic(mem, KM_USER0); | ||
271 | |||
272 | out: | ||
273 | return err; | ||
274 | } | ||
275 | |||
276 | static int brd_make_request(struct request_queue *q, struct bio *bio) | ||
277 | { | ||
278 | struct block_device *bdev = bio->bi_bdev; | ||
279 | struct brd_device *brd = bdev->bd_disk->private_data; | ||
280 | int rw; | ||
281 | struct bio_vec *bvec; | ||
282 | sector_t sector; | ||
283 | int i; | ||
284 | int err = -EIO; | ||
285 | |||
286 | sector = bio->bi_sector; | ||
287 | if (sector + (bio->bi_size >> SECTOR_SHIFT) > | ||
288 | get_capacity(bdev->bd_disk)) | ||
289 | goto out; | ||
290 | |||
291 | rw = bio_rw(bio); | ||
292 | if (rw == READA) | ||
293 | rw = READ; | ||
294 | |||
295 | bio_for_each_segment(bvec, bio, i) { | ||
296 | unsigned int len = bvec->bv_len; | ||
297 | err = brd_do_bvec(brd, bvec->bv_page, len, | ||
298 | bvec->bv_offset, rw, sector); | ||
299 | if (err) | ||
300 | break; | ||
301 | sector += len >> SECTOR_SHIFT; | ||
302 | } | ||
303 | |||
304 | out: | ||
305 | bio_endio(bio, err); | ||
306 | |||
307 | return 0; | ||
308 | } | ||
309 | |||
310 | static int brd_ioctl(struct inode *inode, struct file *file, | ||
311 | unsigned int cmd, unsigned long arg) | ||
312 | { | ||
313 | int error; | ||
314 | struct block_device *bdev = inode->i_bdev; | ||
315 | struct brd_device *brd = bdev->bd_disk->private_data; | ||
316 | |||
317 | if (cmd != BLKFLSBUF) | ||
318 | return -ENOTTY; | ||
319 | |||
320 | /* | ||
321 | * ram device BLKFLSBUF has special semantics, we want to actually | ||
322 | * release and destroy the ramdisk data. | ||
323 | */ | ||
324 | mutex_lock(&bdev->bd_mutex); | ||
325 | error = -EBUSY; | ||
326 | if (bdev->bd_openers <= 1) { | ||
327 | /* | ||
328 | * Invalidate the cache first, so it isn't written | ||
329 | * back to the device. | ||
330 | * | ||
331 | * Another thread might instantiate more buffercache here, | ||
332 | * but there is not much we can do to close that race. | ||
333 | */ | ||
334 | invalidate_bh_lrus(); | ||
335 | truncate_inode_pages(bdev->bd_inode->i_mapping, 0); | ||
336 | brd_free_pages(brd); | ||
337 | error = 0; | ||
338 | } | ||
339 | mutex_unlock(&bdev->bd_mutex); | ||
340 | |||
341 | return error; | ||
342 | } | ||
343 | |||
344 | static struct block_device_operations brd_fops = { | ||
345 | .owner = THIS_MODULE, | ||
346 | .ioctl = brd_ioctl, | ||
347 | }; | ||
348 | |||
349 | /* | ||
350 | * And now the modules code and kernel interface. | ||
351 | */ | ||
352 | static int rd_nr; | ||
353 | int rd_size = CONFIG_BLK_DEV_RAM_SIZE; | ||
354 | module_param(rd_nr, int, 0); | ||
355 | MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); | ||
356 | module_param(rd_size, int, 0); | ||
357 | MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); | ||
358 | MODULE_LICENSE("GPL"); | ||
359 | MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); | ||
360 | |||
361 | #ifndef MODULE | ||
362 | /* Legacy boot options - nonmodular */ | ||
363 | static int __init ramdisk_size(char *str) | ||
364 | { | ||
365 | rd_size = simple_strtol(str, NULL, 0); | ||
366 | return 1; | ||
367 | } | ||
368 | static int __init ramdisk_size2(char *str) | ||
369 | { | ||
370 | return ramdisk_size(str); | ||
371 | } | ||
372 | __setup("ramdisk=", ramdisk_size); | ||
373 | __setup("ramdisk_size=", ramdisk_size2); | ||
374 | #endif | ||
375 | |||
/*
 * The device scheme below is modelled on loop.c.  Keep the two in sync
 * where possible (they should eventually share code).
 *
 * brd_devices links every brd_device through its brd_list member;
 * brd_devices_mutex is taken by brd_probe() around on-demand creation.
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);
382 | |||
383 | static struct brd_device *brd_alloc(int i) | ||
384 | { | ||
385 | struct brd_device *brd; | ||
386 | struct gendisk *disk; | ||
387 | |||
388 | brd = kzalloc(sizeof(*brd), GFP_KERNEL); | ||
389 | if (!brd) | ||
390 | goto out; | ||
391 | brd->brd_number = i; | ||
392 | spin_lock_init(&brd->brd_lock); | ||
393 | INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); | ||
394 | |||
395 | brd->brd_queue = blk_alloc_queue(GFP_KERNEL); | ||
396 | if (!brd->brd_queue) | ||
397 | goto out_free_dev; | ||
398 | blk_queue_make_request(brd->brd_queue, brd_make_request); | ||
399 | blk_queue_max_sectors(brd->brd_queue, 1024); | ||
400 | blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); | ||
401 | |||
402 | disk = brd->brd_disk = alloc_disk(1); | ||
403 | if (!disk) | ||
404 | goto out_free_queue; | ||
405 | disk->major = RAMDISK_MAJOR; | ||
406 | disk->first_minor = i; | ||
407 | disk->fops = &brd_fops; | ||
408 | disk->private_data = brd; | ||
409 | disk->queue = brd->brd_queue; | ||
410 | sprintf(disk->disk_name, "ram%d", i); | ||
411 | set_capacity(disk, rd_size * 2); | ||
412 | |||
413 | return brd; | ||
414 | |||
415 | out_free_queue: | ||
416 | blk_cleanup_queue(brd->brd_queue); | ||
417 | out_free_dev: | ||
418 | kfree(brd); | ||
419 | out: | ||
420 | return NULL; | ||
421 | } | ||
422 | |||
423 | static void brd_free(struct brd_device *brd) | ||
424 | { | ||
425 | put_disk(brd->brd_disk); | ||
426 | blk_cleanup_queue(brd->brd_queue); | ||
427 | brd_free_pages(brd); | ||
428 | kfree(brd); | ||
429 | } | ||
430 | |||
431 | static struct brd_device *brd_init_one(int i) | ||
432 | { | ||
433 | struct brd_device *brd; | ||
434 | |||
435 | list_for_each_entry(brd, &brd_devices, brd_list) { | ||
436 | if (brd->brd_number == i) | ||
437 | goto out; | ||
438 | } | ||
439 | |||
440 | brd = brd_alloc(i); | ||
441 | if (brd) { | ||
442 | add_disk(brd->brd_disk); | ||
443 | list_add_tail(&brd->brd_list, &brd_devices); | ||
444 | } | ||
445 | out: | ||
446 | return brd; | ||
447 | } | ||
448 | |||
449 | static void brd_del_one(struct brd_device *brd) | ||
450 | { | ||
451 | list_del(&brd->brd_list); | ||
452 | del_gendisk(brd->brd_disk); | ||
453 | brd_free(brd); | ||
454 | } | ||
455 | |||
456 | static struct kobject *brd_probe(dev_t dev, int *part, void *data) | ||
457 | { | ||
458 | struct brd_device *brd; | ||
459 | struct kobject *kobj; | ||
460 | |||
461 | mutex_lock(&brd_devices_mutex); | ||
462 | brd = brd_init_one(dev & MINORMASK); | ||
463 | kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM); | ||
464 | mutex_unlock(&brd_devices_mutex); | ||
465 | |||
466 | *part = 0; | ||
467 | return kobj; | ||
468 | } | ||
469 | |||
470 | static int __init brd_init(void) | ||
471 | { | ||
472 | int i, nr; | ||
473 | unsigned long range; | ||
474 | struct brd_device *brd, *next; | ||
475 | |||
476 | /* | ||
477 | * brd module now has a feature to instantiate underlying device | ||
478 | * structure on-demand, provided that there is an access dev node. | ||
479 | * However, this will not work well with user space tool that doesn't | ||
480 | * know about such "feature". In order to not break any existing | ||
481 | * tool, we do the following: | ||
482 | * | ||
483 | * (1) if rd_nr is specified, create that many upfront, and this | ||
484 | * also becomes a hard limit. | ||
485 | * (2) if rd_nr is not specified, create 1 rd device on module | ||
486 | * load, user can further extend brd device by create dev node | ||
487 | * themselves and have kernel automatically instantiate actual | ||
488 | * device on-demand. | ||
489 | */ | ||
490 | if (rd_nr > 1UL << MINORBITS) | ||
491 | return -EINVAL; | ||
492 | |||
493 | if (rd_nr) { | ||
494 | nr = rd_nr; | ||
495 | range = rd_nr; | ||
496 | } else { | ||
497 | nr = CONFIG_BLK_DEV_RAM_COUNT; | ||
498 | range = 1UL << MINORBITS; | ||
499 | } | ||
500 | |||
501 | if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) | ||
502 | return -EIO; | ||
503 | |||
504 | for (i = 0; i < nr; i++) { | ||
505 | brd = brd_alloc(i); | ||
506 | if (!brd) | ||
507 | goto out_free; | ||
508 | list_add_tail(&brd->brd_list, &brd_devices); | ||
509 | } | ||
510 | |||
511 | /* point of no return */ | ||
512 | |||
513 | list_for_each_entry(brd, &brd_devices, brd_list) | ||
514 | add_disk(brd->brd_disk); | ||
515 | |||
516 | blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range, | ||
517 | THIS_MODULE, brd_probe, NULL, NULL); | ||
518 | |||
519 | printk(KERN_INFO "brd: module loaded\n"); | ||
520 | return 0; | ||
521 | |||
522 | out_free: | ||
523 | list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { | ||
524 | list_del(&brd->brd_list); | ||
525 | brd_free(brd); | ||
526 | } | ||
527 | |||
528 | unregister_blkdev(RAMDISK_MAJOR, "brd"); | ||
529 | return -ENOMEM; | ||
530 | } | ||
531 | |||
532 | static void __exit brd_exit(void) | ||
533 | { | ||
534 | unsigned long range; | ||
535 | struct brd_device *brd, *next; | ||
536 | |||
537 | range = rd_nr ? rd_nr : 1UL << MINORBITS; | ||
538 | |||
539 | list_for_each_entry_safe(brd, next, &brd_devices, brd_list) | ||
540 | brd_del_one(brd); | ||
541 | |||
542 | blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range); | ||
543 | unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); | ||
544 | } | ||
545 | |||
/* Hook brd_init()/brd_exit() up as the module's entry and exit points. */
module_init(brd_init);
module_exit(brd_exit);
548 | |||