aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
authorNick Piggin <npiggin@suse.de>2008-02-08 07:19:49 -0500
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2008-02-08 12:22:30 -0500
commit9db5579be4bb5320c3248f6acf807aedf05ae143 (patch)
treefde09bbeb427946b30d2e0fb6b00494a42488052 /drivers/block
parentdaeb51e62cacde31c8245866e1096ff79a0c83fe (diff)
rewrite rd
This is a rewrite of the ramdisk block device driver. The old one is really difficult because it effectively implements a block device which serves data out of its own buffer cache. It relies on the dirty bit being set to pin its backing store in cache; however, there are non-trivial paths which can clear the dirty bit (e.g. try_to_free_buffers()), which has recently led to data corruption. And in general it is completely wrong for a block device driver to do this. The new one is more like a regular block device driver. It has no idea about vm/vfs stuff. Its backing store is similar to the buffer cache (a simple radix-tree of pages), but it doesn't know anything about page cache (the pages in the radix tree are not pagecache pages). There is one slight downside -- direct block device access and filesystem metadata access go through an extra copy and get stored in RAM twice. However, this downside is only slight, because the real buffercache of the device is now reclaimable (because we're not playing crazy games with it), so under memory intensive situations, footprint should effectively be the same -- maybe even a slight advantage to the new driver because it can also reclaim buffer heads. The fact that it now goes through all the regular vm/fs paths makes it much more useful for testing, too. text data bss dec hex filename 2837 849 384 4070 fe6 drivers/block/rd.o 3528 371 12 3911 f47 drivers/block/brd.o Text is larger, but data and bss are smaller, making total size smaller. A few other nice things about it: - Similar structure and layout to the new loop device handling. - Dynamic ramdisk creation. - Runtime flexible buffer head size (because it is no longer part of the ramdisk code). - Boot / load time flexible ramdisk size, which could easily be extended to a per-ramdisk runtime changeable size (e.g. with an ioctl). - Can use highmem for the backing store. 
[akpm@linux-foundation.org: fix build] [byron.bbradley@gmail.com: make rd_size non-static] Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Byron Bradley <byron.bbradley@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/Kconfig12
-rw-r--r--drivers/block/Makefile2
-rw-r--r--drivers/block/brd.c548
-rw-r--r--drivers/block/rd.c537
4 files changed, 550 insertions, 549 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 64e5148d82bc..8be67cd3fe01 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -322,7 +322,7 @@ config BLK_DEV_UB
322 If unsure, say N. 322 If unsure, say N.
323 323
324config BLK_DEV_RAM 324config BLK_DEV_RAM
325 tristate "RAM disk support" 325 tristate "RAM block device support"
326 ---help--- 326 ---help---
327 Saying Y here will allow you to use a portion of your RAM memory as 327 Saying Y here will allow you to use a portion of your RAM memory as
328 a block device, so that you can make file systems on it, read and 328 a block device, so that you can make file systems on it, read and
@@ -357,16 +357,6 @@ config BLK_DEV_RAM_SIZE
357 The default value is 4096 kilobytes. Only change this if you know 357 The default value is 4096 kilobytes. Only change this if you know
358 what you are doing. 358 what you are doing.
359 359
360config BLK_DEV_RAM_BLOCKSIZE
361 int "Default RAM disk block size (bytes)"
362 depends on BLK_DEV_RAM
363 default "1024"
364 help
365 The default value is 1024 bytes. PAGE_SIZE is a much more
366 efficient choice however. The default is kept to ensure initrd
367 setups function - apparently needed by the rd_load_image routine
368 that supposes the filesystem in the image uses a 1024 blocksize.
369
370config CDROM_PKTCDVD 360config CDROM_PKTCDVD
371 tristate "Packet writing on CD/DVD media" 361 tristate "Packet writing on CD/DVD media"
372 depends on !UML 362 depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 7691505a2e12..01c972415cb2 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_AMIGA_FLOPPY) += amiflop.o
11obj-$(CONFIG_PS3_DISK) += ps3disk.o 11obj-$(CONFIG_PS3_DISK) += ps3disk.o
12obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o 12obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
13obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o 13obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
14obj-$(CONFIG_BLK_DEV_RAM) += rd.o 14obj-$(CONFIG_BLK_DEV_RAM) += brd.o
15obj-$(CONFIG_BLK_DEV_LOOP) += loop.o 15obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
16obj-$(CONFIG_BLK_DEV_PS2) += ps2esdi.o 16obj-$(CONFIG_BLK_DEV_PS2) += ps2esdi.o
17obj-$(CONFIG_BLK_DEV_XD) += xd.o 17obj-$(CONFIG_BLK_DEV_XD) += xd.o
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
new file mode 100644
index 000000000000..50b659bedc8f
--- /dev/null
+++ b/drivers/block/brd.c
@@ -0,0 +1,548 @@
1/*
2 * Ram backed block device driver.
3 *
4 * Copyright (C) 2007 Nick Piggin
5 * Copyright (C) 2007 Novell Inc.
6 *
7 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
8 * of their respective owners.
9 */
10
11#include <linux/init.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/major.h>
15#include <linux/blkdev.h>
16#include <linux/bio.h>
17#include <linux/highmem.h>
18#include <linux/gfp.h>
19#include <linux/radix-tree.h>
20#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
21
22#include <asm/uaccess.h>
23
24#define SECTOR_SHIFT 9
25#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
26#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
27
28/*
29 * Each block ramdisk device has a radix_tree brd_pages of pages that stores
30 * the pages containing the block device's contents. A brd page's ->index is
31 * its offset in PAGE_SIZE units. This is similar to, but in no way connected
32 * with, the kernel's pagecache or buffer cache (which sit above our block
33 * device).
34 */
35struct brd_device {
36 int brd_number; /* device index: set from brd_alloc()'s argument, matched in brd_init_one() */
37 int brd_refcnt; /* NOTE(review): not referenced by the code visible here -- presumably inherited from loop.c's layout; confirm */
38 loff_t brd_offset; /* NOTE(review): not referenced by the code visible here; confirm whether needed */
39 loff_t brd_sizelimit; /* NOTE(review): not referenced by the code visible here; confirm whether needed */
40 unsigned brd_blocksize; /* NOTE(review): not referenced by the code visible here; confirm whether needed */
41
42 struct request_queue *brd_queue; /* queue from blk_alloc_queue(); bios are handled by brd_make_request() */
43 struct gendisk *brd_disk; /* disk from alloc_disk(1); its ->private_data points back to this struct */
44 struct list_head brd_list; /* link in the file-scope brd_devices list */
45
46 /*
47 * Backing store of pages and lock to protect it. This is the contents
48 * of the block device.
49 */
50 spinlock_t brd_lock; /* held around radix_tree_insert() in brd_insert_page() */
51 struct radix_tree_root brd_pages; /* keyed by page index (sector >> PAGE_SECTORS_SHIFT) */
52};
53
54/*
55 * Look up and return a brd's page for a given sector.
56 */
57static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
58{
59 pgoff_t idx;
60 struct page *page;
61
62 /*
63 * The page lifetime is protected by the fact that we have opened the
64 * device node -- brd pages will never be deleted under us, so we
65 * don't need any further locking or refcounting.
66 *
67 * This is strictly true for the radix-tree nodes as well (ie. we
68 * don't actually need the rcu_read_lock()), however that is not a
69 * documented feature of the radix-tree API so it is better to be
70 * safe here (we don't have total exclusion from radix tree updates
71 * here, only deletes).
72 */
73 rcu_read_lock();
74 idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
75 page = radix_tree_lookup(&brd->brd_pages, idx);
76 rcu_read_unlock();
77
78 BUG_ON(page && page->index != idx);
79
80 return page;
81}
82
83/*
84 * Look up and return a brd's page for a given sector.
85 * If one does not exist, allocate an empty page, and insert that. Then
86 * return it.
87 */
88static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
89{
90 pgoff_t idx;
91 struct page *page;
92
93 page = brd_lookup_page(brd, sector);
94 if (page)
95 return page;
96
97 /*
98 * Must use NOIO because we don't want to recurse back into the
99 * block or filesystem layers from page reclaim.
100 */
101 page = alloc_page(GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO);
102 if (!page)
103 return NULL;
104
105 if (radix_tree_preload(GFP_NOIO)) {
106 __free_page(page);
107 return NULL;
108 }
109
110 spin_lock(&brd->brd_lock);
111 idx = sector >> PAGE_SECTORS_SHIFT;
112 if (radix_tree_insert(&brd->brd_pages, idx, page)) {
113 __free_page(page);
114 page = radix_tree_lookup(&brd->brd_pages, idx);
115 BUG_ON(!page);
116 BUG_ON(page->index != idx);
117 } else
118 page->index = idx;
119 spin_unlock(&brd->brd_lock);
120
121 radix_tree_preload_end();
122
123 return page;
124}
125
126/*
127 * Free all backing store pages and radix tree. This must only be called when
128 * there are no other users of the device.
129 */
130#define FREE_BATCH 16
131static void brd_free_pages(struct brd_device *brd)
132{
133 unsigned long pos = 0;
134 struct page *pages[FREE_BATCH];
135 int nr_pages;
136
137 do {
138 int i;
139
140 nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
141 (void **)pages, pos, FREE_BATCH);
142
143 for (i = 0; i < nr_pages; i++) {
144 void *ret;
145
146 BUG_ON(pages[i]->index < pos);
147 pos = pages[i]->index;
148 ret = radix_tree_delete(&brd->brd_pages, pos);
149 BUG_ON(!ret || ret != pages[i]);
150 __free_page(pages[i]);
151 }
152
153 pos++;
154
155 /*
156 * This assumes radix_tree_gang_lookup always returns as
157 * many pages as possible. If the radix-tree code changes,
158 * so will this have to.
159 */
160 } while (nr_pages == FREE_BATCH);
161}
162
163/*
164 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
165 */
166static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
167{
168 unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
169 size_t copy;
170
171 copy = min_t(size_t, n, PAGE_SIZE - offset);
172 if (!brd_insert_page(brd, sector))
173 return -ENOMEM;
174 if (copy < n) {
175 sector += copy >> SECTOR_SHIFT;
176 if (!brd_insert_page(brd, sector))
177 return -ENOMEM;
178 }
179 return 0;
180}
181
182/*
183 * Copy n bytes from src to the brd starting at sector. Does not sleep.
184 */
185static void copy_to_brd(struct brd_device *brd, const void *src,
186 sector_t sector, size_t n)
187{
188 struct page *page;
189 void *dst;
190 unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
191 size_t copy;
192
193 copy = min_t(size_t, n, PAGE_SIZE - offset);
194 page = brd_lookup_page(brd, sector);
195 BUG_ON(!page);
196
197 dst = kmap_atomic(page, KM_USER1);
198 memcpy(dst + offset, src, copy);
199 kunmap_atomic(dst, KM_USER1);
200
201 if (copy < n) {
202 src += copy;
203 sector += copy >> SECTOR_SHIFT;
204 copy = n - copy;
205 page = brd_lookup_page(brd, sector);
206 BUG_ON(!page);
207
208 dst = kmap_atomic(page, KM_USER1);
209 memcpy(dst, src, copy);
210 kunmap_atomic(dst, KM_USER1);
211 }
212}
213
214/*
215 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
216 */
217static void copy_from_brd(void *dst, struct brd_device *brd,
218 sector_t sector, size_t n)
219{
220 struct page *page;
221 void *src;
222 unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
223 size_t copy;
224
225 copy = min_t(size_t, n, PAGE_SIZE - offset);
226 page = brd_lookup_page(brd, sector);
227 if (page) {
228 src = kmap_atomic(page, KM_USER1);
229 memcpy(dst, src + offset, copy);
230 kunmap_atomic(src, KM_USER1);
231 } else
232 memset(dst, 0, copy);
233
234 if (copy < n) {
235 dst += copy;
236 sector += copy >> SECTOR_SHIFT;
237 copy = n - copy;
238 page = brd_lookup_page(brd, sector);
239 if (page) {
240 src = kmap_atomic(page, KM_USER1);
241 memcpy(dst, src, copy);
242 kunmap_atomic(src, KM_USER1);
243 } else
244 memset(dst, 0, copy);
245 }
246}
247
248/*
249 * Process a single bvec of a bio.
250 */
251static int brd_do_bvec(struct brd_device *brd, struct page *page,
252 unsigned int len, unsigned int off, int rw,
253 sector_t sector)
254{
255 void *mem;
256 int err = 0;
257
258 if (rw != READ) {
259 err = copy_to_brd_setup(brd, sector, len);
260 if (err)
261 goto out;
262 }
263
264 mem = kmap_atomic(page, KM_USER0);
265 if (rw == READ) {
266 copy_from_brd(mem + off, brd, sector, len);
267 flush_dcache_page(page);
268 } else
269 copy_to_brd(brd, mem + off, sector, len);
270 kunmap_atomic(mem, KM_USER0);
271
272out:
273 return err;
274}
275
276static int brd_make_request(struct request_queue *q, struct bio *bio)
277{
278 struct block_device *bdev = bio->bi_bdev;
279 struct brd_device *brd = bdev->bd_disk->private_data;
280 int rw;
281 struct bio_vec *bvec;
282 sector_t sector;
283 int i;
284 int err = -EIO;
285
286 sector = bio->bi_sector;
287 if (sector + (bio->bi_size >> SECTOR_SHIFT) >
288 get_capacity(bdev->bd_disk))
289 goto out;
290
291 rw = bio_rw(bio);
292 if (rw == READA)
293 rw = READ;
294
295 bio_for_each_segment(bvec, bio, i) {
296 unsigned int len = bvec->bv_len;
297 err = brd_do_bvec(brd, bvec->bv_page, len,
298 bvec->bv_offset, rw, sector);
299 if (err)
300 break;
301 sector += len >> SECTOR_SHIFT;
302 }
303
304out:
305 bio_endio(bio, err);
306
307 return 0;
308}
309
310static int brd_ioctl(struct inode *inode, struct file *file,
311 unsigned int cmd, unsigned long arg)
312{
313 int error;
314 struct block_device *bdev = inode->i_bdev;
315 struct brd_device *brd = bdev->bd_disk->private_data;
316
317 if (cmd != BLKFLSBUF)
318 return -ENOTTY;
319
320 /*
321 * ram device BLKFLSBUF has special semantics, we want to actually
322 * release and destroy the ramdisk data.
323 */
324 mutex_lock(&bdev->bd_mutex);
325 error = -EBUSY;
326 if (bdev->bd_openers <= 1) {
327 /*
328 * Invalidate the cache first, so it isn't written
329 * back to the device.
330 *
331 * Another thread might instantiate more buffercache here,
332 * but there is not much we can do to close that race.
333 */
334 invalidate_bh_lrus();
335 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
336 brd_free_pages(brd);
337 error = 0;
338 }
339 mutex_unlock(&bdev->bd_mutex);
340
341 return error;
342}
343
344static struct block_device_operations brd_fops = { /* installed as disk->fops in brd_alloc() */
345 .owner = THIS_MODULE,
346 .ioctl = brd_ioctl, /* only BLKFLSBUF is handled; no .open or .release is provided */
347};
348
349/*
350 * And now the modules code and kernel interface.
351 */
352static int rd_nr; /* 0 (unset) makes brd_init() create CONFIG_BLK_DEV_RAM_COUNT devices and register the full minor range */
353int rd_size = CONFIG_BLK_DEV_RAM_SIZE; /* in kbytes; brd_alloc() sets capacity to rd_size * 2 sectors. Deliberately non-static (see commit message) */
354module_param(rd_nr, int, 0);
355MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
356module_param(rd_size, int, 0);
357MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
358MODULE_LICENSE("GPL");
359MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
360
361#ifndef MODULE
362/* Legacy boot options - nonmodular */
363static int __init ramdisk_size(char *str)
364{
365 rd_size = simple_strtol(str, NULL, 0);
366 return 1;
367}
368static int __init ramdisk_size2(char *str)
369{
370 return ramdisk_size(str);
371}
372__setup("ramdisk=", ramdisk_size);
373__setup("ramdisk_size=", ramdisk_size2);
374#endif
375
376/*
377 * The device scheme is derived from loop.c. Keep them in synch where possible
378 * (should share code eventually).
379 */
380static LIST_HEAD(brd_devices); /* every brd_device, linked through ->brd_list */
381static DEFINE_MUTEX(brd_devices_mutex); /* taken in brd_probe() around brd_init_one(); brd_init()/brd_exit() touch the list without it */
382
383static struct brd_device *brd_alloc(int i)
384{
385 struct brd_device *brd;
386 struct gendisk *disk;
387
388 brd = kzalloc(sizeof(*brd), GFP_KERNEL);
389 if (!brd)
390 goto out;
391 brd->brd_number = i;
392 spin_lock_init(&brd->brd_lock);
393 INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
394
395 brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
396 if (!brd->brd_queue)
397 goto out_free_dev;
398 blk_queue_make_request(brd->brd_queue, brd_make_request);
399 blk_queue_max_sectors(brd->brd_queue, 1024);
400 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
401
402 disk = brd->brd_disk = alloc_disk(1);
403 if (!disk)
404 goto out_free_queue;
405 disk->major = RAMDISK_MAJOR;
406 disk->first_minor = i;
407 disk->fops = &brd_fops;
408 disk->private_data = brd;
409 disk->queue = brd->brd_queue;
410 sprintf(disk->disk_name, "ram%d", i);
411 set_capacity(disk, rd_size * 2);
412
413 return brd;
414
415out_free_queue:
416 blk_cleanup_queue(brd->brd_queue);
417out_free_dev:
418 kfree(brd);
419out:
420 return NULL;
421}
422
423static void brd_free(struct brd_device *brd) /* undo brd_alloc(): release disk, queue, backing pages, then the struct */
424{
425 put_disk(brd->brd_disk); /* drops the reference from alloc_disk(); del_gendisk() is the caller's job (see brd_del_one()) */
426 blk_cleanup_queue(brd->brd_queue);
427 brd_free_pages(brd); /* requires that there are no other users of the device */
428 kfree(brd);
429}
430
431static struct brd_device *brd_init_one(int i)
432{
433 struct brd_device *brd;
434
435 list_for_each_entry(brd, &brd_devices, brd_list) {
436 if (brd->brd_number == i)
437 goto out;
438 }
439
440 brd = brd_alloc(i);
441 if (brd) {
442 add_disk(brd->brd_disk);
443 list_add_tail(&brd->brd_list, &brd_devices);
444 }
445out:
446 return brd;
447}
448
449static void brd_del_one(struct brd_device *brd) /* tear down one registered device; called for each list entry from brd_exit() */
450{
451 list_del(&brd->brd_list); /* unlink from brd_devices first */
452 del_gendisk(brd->brd_disk); /* counterpart of the add_disk() done at registration */
453 brd_free(brd); /* then release disk, queue, pages and the struct itself */
454}
455
456static struct kobject *brd_probe(dev_t dev, int *part, void *data)
457{
458 struct brd_device *brd;
459 struct kobject *kobj;
460
461 mutex_lock(&brd_devices_mutex);
462 brd = brd_init_one(dev & MINORMASK);
463 kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM);
464 mutex_unlock(&brd_devices_mutex);
465
466 *part = 0;
467 return kobj;
468}
469
470static int __init brd_init(void)
471{
472 int i, nr;
473 unsigned long range;
474 struct brd_device *brd, *next;
475
476 /*
477 * brd module now has a feature to instantiate underlying device
478 * structure on-demand, provided that there is an access dev node.
479 * However, this will not work well with user space tool that doesn't
480 * know about such "feature". In order to not break any existing
481 * tool, we do the following:
482 *
483 * (1) if rd_nr is specified, create that many upfront, and this
484 * also becomes a hard limit.
485 * (2) if rd_nr is not specified, create 1 rd device on module
486 * load, user can further extend brd device by create dev node
487 * themselves and have kernel automatically instantiate actual
488 * device on-demand.
489 */
490 if (rd_nr > 1UL << MINORBITS)
491 return -EINVAL;
492
493 if (rd_nr) {
494 nr = rd_nr;
495 range = rd_nr;
496 } else {
497 nr = CONFIG_BLK_DEV_RAM_COUNT;
498 range = 1UL << MINORBITS;
499 }
500
501 if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
502 return -EIO;
503
504 for (i = 0; i < nr; i++) {
505 brd = brd_alloc(i);
506 if (!brd)
507 goto out_free;
508 list_add_tail(&brd->brd_list, &brd_devices);
509 }
510
511 /* point of no return */
512
513 list_for_each_entry(brd, &brd_devices, brd_list)
514 add_disk(brd->brd_disk);
515
516 blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
517 THIS_MODULE, brd_probe, NULL, NULL);
518
519 printk(KERN_INFO "brd: module loaded\n");
520 return 0;
521
522out_free:
523 list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
524 list_del(&brd->brd_list);
525 brd_free(brd);
526 }
527
528 unregister_blkdev(RAMDISK_MAJOR, "brd");
529 return -ENOMEM;
530}
531
532static void __exit brd_exit(void)
533{
534 unsigned long range;
535 struct brd_device *brd, *next;
536
537 range = rd_nr ? rd_nr : 1UL << MINORBITS;
538
539 list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
540 brd_del_one(brd);
541
542 blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
543 unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
544}
545
546module_init(brd_init);
547module_exit(brd_exit);
548
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
deleted file mode 100644
index 06e23be70904..000000000000
--- a/drivers/block/rd.c
+++ /dev/null
@@ -1,537 +0,0 @@
1/*
2 * ramdisk.c - Multiple RAM disk driver - gzip-loading version - v. 0.8 beta.
3 *
4 * (C) Chad Page, Theodore Ts'o, et. al, 1995.
5 *
6 * This RAM disk is designed to have filesystems created on it and mounted
7 * just like a regular floppy disk.
8 *
9 * It also does something suggested by Linus: use the buffer cache as the
10 * RAM disk data. This makes it possible to dynamically allocate the RAM disk
11 * buffer - with some consequences I have to deal with as I write this.
12 *
13 * This code is based on the original ramdisk.c, written mostly by
14 * Theodore Ts'o (TYT) in 1991. The code was largely rewritten by
15 * Chad Page to use the buffer cache to store the RAM disk data in
16 * 1995; Theodore then took over the driver again, and cleaned it up
17 * for inclusion in the mainline kernel.
18 *
19 * The original CRAMDISK code was written by Richard Lyons, and
20 * adapted by Chad Page to use the new RAM disk interface. Theodore
21 * Ts'o rewrote it so that both the compressed RAM disk loader and the
22 * kernel decompressor uses the same inflate.c codebase. The RAM disk
23 * loader now also loads into a dynamic (buffer cache based) RAM disk,
24 * not the old static RAM disk. Support for the old static RAM disk has
25 * been completely removed.
26 *
27 * Loadable module support added by Tom Dyas.
28 *
29 * Further cleanups by Chad Page (page0588@sundance.sjsu.edu):
30 * Cosmetic changes in #ifdef MODULE, code movement, etc.
31 * When the RAM disk module is removed, free the protected buffers
32 * Default RAM disk size changed to 2.88 MB
33 *
34 * Added initrd: Werner Almesberger & Hans Lermen, Feb '96
35 *
36 * 4/25/96 : Made RAM disk size a parameter (default is now 4 MB)
37 * - Chad Page
38 *
39 * Add support for fs images split across >1 disk, Paul Gortmaker, Mar '98
40 *
41 * Make block size and block size shift for RAM disks a global macro
42 * and set blk_size for -ENOSPC, Werner Fink <werner@suse.de>, Apr '99
43 */
44
45#include <linux/string.h>
46#include <linux/slab.h>
47#include <asm/atomic.h>
48#include <linux/bio.h>
49#include <linux/module.h>
50#include <linux/moduleparam.h>
51#include <linux/init.h>
52#include <linux/pagemap.h>
53#include <linux/blkdev.h>
54#include <linux/genhd.h>
55#include <linux/buffer_head.h> /* for invalidate_bdev() */
56#include <linux/backing-dev.h>
57#include <linux/blkpg.h>
58#include <linux/writeback.h>
59#include <linux/log2.h>
60
61#include <asm/uaccess.h>
62
63/* Various static variables go here. Most are used only in the RAM disk code.
64 */
65
66static struct gendisk *rd_disks[CONFIG_BLK_DEV_RAM_COUNT];
67static struct block_device *rd_bdev[CONFIG_BLK_DEV_RAM_COUNT];/* Protected device data */
68static struct request_queue *rd_queue[CONFIG_BLK_DEV_RAM_COUNT];
69
70/*
71 * Parameters for the boot-loading of the RAM disk. These are set by
72 * init/main.c (from arguments to the kernel command line) or from the
73 * architecture-specific setup routine (from the stored boot sector
74 * information).
75 */
76int rd_size = CONFIG_BLK_DEV_RAM_SIZE; /* Size of the RAM disks */
77/*
78 * It would be very desirable to have a soft-blocksize (that in the case
79 * of the ramdisk driver is also the hardblocksize ;) of PAGE_SIZE because
80 * doing that we'll achieve a far better MM footprint. Using a rd_blocksize of
81 * BLOCK_SIZE in the worst case we'll make PAGE_SIZE/BLOCK_SIZE buffer-pages
82 * unfreeable. With a rd_blocksize of PAGE_SIZE instead we are sure that only
83 * 1 page will be protected. Depending on the size of the ramdisk you
84 * may want to change the ramdisk blocksize to achieve a better or worse MM
85 * behaviour. The default is still BLOCK_SIZE (needed by rd_load_image that
86 * supposes the filesystem in the image uses a BLOCK_SIZE blocksize).
87 */
88static int rd_blocksize = CONFIG_BLK_DEV_RAM_BLOCKSIZE;
89
90/*
91 * Copyright (C) 2000 Linus Torvalds.
92 * 2000 Transmeta Corp.
93 * aops copied from ramfs.
94 */
95
96/*
97 * If a ramdisk page has buffers, some may be uptodate and some may be not.
98 * To bring the page uptodate we zero out the non-uptodate buffers. The
99 * page must be locked.
100 */
101static void make_page_uptodate(struct page *page)
102{
103 if (page_has_buffers(page)) {
104 struct buffer_head *bh = page_buffers(page);
105 struct buffer_head *head = bh;
106
107 do {
108 if (!buffer_uptodate(bh)) {
109 memset(bh->b_data, 0, bh->b_size);
110 /*
111 * akpm: I'm totally undecided about this. The
112 * buffer has just been magically brought "up to
113 * date", but nobody should want to be reading
114 * it anyway, because it hasn't been used for
115 * anything yet. It is still in a "not read
116 * from disk yet" state.
117 *
118 * But non-uptodate buffers against an uptodate
119 * page are against the rules. So do it anyway.
120 */
121 set_buffer_uptodate(bh);
122 }
123 } while ((bh = bh->b_this_page) != head);
124 } else {
125 memset(page_address(page), 0, PAGE_CACHE_SIZE);
126 }
127 flush_dcache_page(page);
128 SetPageUptodate(page);
129}
130
131static int ramdisk_readpage(struct file *file, struct page *page)
132{
133 if (!PageUptodate(page))
134 make_page_uptodate(page);
135 unlock_page(page);
136 return 0;
137}
138
139static int ramdisk_prepare_write(struct file *file, struct page *page,
140 unsigned offset, unsigned to)
141{
142 if (!PageUptodate(page))
143 make_page_uptodate(page);
144 return 0;
145}
146
147static int ramdisk_commit_write(struct file *file, struct page *page,
148 unsigned offset, unsigned to)
149{
150 set_page_dirty(page);
151 return 0;
152}
153
154/*
155 * ->writepage to the blockdev's mapping has to redirty the page so that the
156 * VM doesn't go and steal it. We return AOP_WRITEPAGE_ACTIVATE so that the VM
157 * won't try to (pointlessly) write the page again for a while.
158 *
159 * Really, these pages should not be on the LRU at all.
160 */
161static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
162{
163 if (!PageUptodate(page))
164 make_page_uptodate(page);
165 SetPageDirty(page);
166 if (wbc->for_reclaim)
167 return AOP_WRITEPAGE_ACTIVATE;
168 unlock_page(page);
169 return 0;
170}
171
172/*
173 * This is a little speedup thing: short-circuit attempts to write back the
174 * ramdisk blockdev inode to its non-existent backing store.
175 */
176static int ramdisk_writepages(struct address_space *mapping,
177 struct writeback_control *wbc)
178{
179 return 0;
180}
181
182/*
183 * ramdisk blockdev pages have their own ->set_page_dirty() because we don't
184 * want them to contribute to dirty memory accounting.
185 */
186static int ramdisk_set_page_dirty(struct page *page)
187{
188 if (!TestSetPageDirty(page))
189 return 1;
190 return 0;
191}
192
193/*
194 * releasepage is called by pagevec_strip/try_to_release_page if
195 * buffers_heads_over_limit is true. Without a releasepage function
196 * try_to_free_buffers is called instead. That can unset the dirty
197 * bit of our ram disk pages, which will be eventually freed, even
198 * if the page is still in use.
199 */
200static int ramdisk_releasepage(struct page *page, gfp_t dummy)
201{
202 return 0;
203}
204
205static const struct address_space_operations ramdisk_aops = {
206 .readpage = ramdisk_readpage,
207 .prepare_write = ramdisk_prepare_write,
208 .commit_write = ramdisk_commit_write,
209 .writepage = ramdisk_writepage,
210 .set_page_dirty = ramdisk_set_page_dirty,
211 .writepages = ramdisk_writepages,
212 .releasepage = ramdisk_releasepage,
213};
214
215static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector,
216 struct address_space *mapping)
217{
218 pgoff_t index = sector >> (PAGE_CACHE_SHIFT - 9);
219 unsigned int vec_offset = vec->bv_offset;
220 int offset = (sector << 9) & ~PAGE_CACHE_MASK;
221 int size = vec->bv_len;
222 int err = 0;
223
224 do {
225 int count;
226 struct page *page;
227 char *src;
228 char *dst;
229
230 count = PAGE_CACHE_SIZE - offset;
231 if (count > size)
232 count = size;
233 size -= count;
234
235 page = grab_cache_page(mapping, index);
236 if (!page) {
237 err = -ENOMEM;
238 goto out;
239 }
240
241 if (!PageUptodate(page))
242 make_page_uptodate(page);
243
244 index++;
245
246 if (rw == READ) {
247 src = kmap_atomic(page, KM_USER0) + offset;
248 dst = kmap_atomic(vec->bv_page, KM_USER1) + vec_offset;
249 } else {
250 src = kmap_atomic(vec->bv_page, KM_USER0) + vec_offset;
251 dst = kmap_atomic(page, KM_USER1) + offset;
252 }
253 offset = 0;
254 vec_offset += count;
255
256 memcpy(dst, src, count);
257
258 kunmap_atomic(src, KM_USER0);
259 kunmap_atomic(dst, KM_USER1);
260
261 if (rw == READ)
262 flush_dcache_page(vec->bv_page);
263 else
264 set_page_dirty(page);
265 unlock_page(page);
266 put_page(page);
267 } while (size);
268
269 out:
270 return err;
271}
272
273/*
274 * Basically, my strategy here is to set up a buffer-head which can't be
275 * deleted, and make that my Ramdisk. If the request is outside of the
276 * allocated size, we must get rid of it...
277 *
278 * 19-JAN-1998 Richard Gooch <rgooch@atnf.csiro.au> Added devfs support
279 *
280 */
281static int rd_make_request(struct request_queue *q, struct bio *bio)
282{
283 struct block_device *bdev = bio->bi_bdev;
284 struct address_space * mapping = bdev->bd_inode->i_mapping;
285 sector_t sector = bio->bi_sector;
286 unsigned long len = bio->bi_size >> 9;
287 int rw = bio_data_dir(bio);
288 struct bio_vec *bvec;
289 int ret = 0, i;
290
291 if (sector + len > get_capacity(bdev->bd_disk))
292 goto fail;
293
294 if (rw==READA)
295 rw=READ;
296
297 bio_for_each_segment(bvec, bio, i) {
298 ret |= rd_blkdev_pagecache_IO(rw, bvec, sector, mapping);
299 sector += bvec->bv_len >> 9;
300 }
301 if (ret)
302 goto fail;
303
304 bio_endio(bio, 0);
305 return 0;
306fail:
307 bio_io_error(bio);
308 return 0;
309}
310
311static int rd_ioctl(struct inode *inode, struct file *file,
312 unsigned int cmd, unsigned long arg)
313{
314 int error;
315 struct block_device *bdev = inode->i_bdev;
316
317 if (cmd != BLKFLSBUF)
318 return -ENOTTY;
319
320 /*
321 * special: we want to release the ramdisk memory, it's not like with
322 * the other blockdevices where this ioctl only flushes away the buffer
323 * cache
324 */
325 error = -EBUSY;
326 mutex_lock(&bdev->bd_mutex);
327 if (bdev->bd_openers <= 2) {
328 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
329 error = 0;
330 }
331 mutex_unlock(&bdev->bd_mutex);
332 return error;
333}
334
335/*
336 * This is the backing_dev_info for the blockdev inode itself. It doesn't need
337 * writeback and it does not contribute to dirty memory accounting.
338 */
339static struct backing_dev_info rd_backing_dev_info = {
340 .ra_pages = 0, /* No readahead */
341 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | BDI_CAP_MAP_COPY,
342 .unplug_io_fn = default_unplug_io_fn,
343};
344
345/*
346 * This is the backing_dev_info for the files which live atop the ramdisk
347 * "device". These files do need writeback and they do contribute to dirty
348 * memory accounting.
349 */
350static struct backing_dev_info rd_file_backing_dev_info = {
351 .ra_pages = 0, /* No readahead */
352 .capabilities = BDI_CAP_MAP_COPY, /* Does contribute to dirty memory */
353 .unplug_io_fn = default_unplug_io_fn,
354};
355
356static int rd_open(struct inode *inode, struct file *filp)
357{
358 unsigned unit = iminor(inode);
359
360 if (rd_bdev[unit] == NULL) {
361 struct block_device *bdev = inode->i_bdev;
362 struct address_space *mapping;
363 unsigned bsize;
364 gfp_t gfp_mask;
365
366 inode = igrab(bdev->bd_inode);
367 rd_bdev[unit] = bdev;
368 bdev->bd_openers++;
369 bsize = bdev_hardsect_size(bdev);
370 bdev->bd_block_size = bsize;
371 inode->i_blkbits = blksize_bits(bsize);
372 inode->i_size = get_capacity(bdev->bd_disk)<<9;
373
374 mapping = inode->i_mapping;
375 mapping->a_ops = &ramdisk_aops;
376 mapping->backing_dev_info = &rd_backing_dev_info;
377 bdev->bd_inode_backing_dev_info = &rd_file_backing_dev_info;
378
379 /*
380 * Deep badness. rd_blkdev_pagecache_IO() needs to allocate
381 * pagecache pages within a request_fn. We cannot recur back
382 * into the filesystem which is mounted atop the ramdisk, because
383 * that would deadlock on fs locks. And we really don't want
384 * to reenter rd_blkdev_pagecache_IO when we're already within
385 * that function.
386 *
387 * So we turn off __GFP_FS and __GFP_IO.
388 *
389 * And to give this thing a hope of working, turn on __GFP_HIGH.
390 * Hopefully, there's enough regular memory allocation going on
391 * for the page allocator emergency pools to keep the ramdisk
392 * driver happy.
393 */
394 gfp_mask = mapping_gfp_mask(mapping);
395 gfp_mask &= ~(__GFP_FS|__GFP_IO);
396 gfp_mask |= __GFP_HIGH;
397 mapping_set_gfp_mask(mapping, gfp_mask);
398 }
399
400 return 0;
401}
402
403static struct block_device_operations rd_bd_op = {
404 .owner = THIS_MODULE,
405 .open = rd_open,
406 .ioctl = rd_ioctl,
407};
408
409/*
410 * Before freeing the module, invalidate all of the protected buffers!
411 */
412static void __exit rd_cleanup(void)
413{
414 int i;
415
416 for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
417 struct block_device *bdev = rd_bdev[i];
418 rd_bdev[i] = NULL;
419 if (bdev) {
420 invalidate_bdev(bdev);
421 blkdev_put(bdev);
422 }
423 del_gendisk(rd_disks[i]);
424 put_disk(rd_disks[i]);
425 blk_cleanup_queue(rd_queue[i]);
426 }
427 unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
428
429 bdi_destroy(&rd_file_backing_dev_info);
430 bdi_destroy(&rd_backing_dev_info);
431}
432
433/*
434 * This is the registration and initialization section of the RAM disk driver
435 */
436static int __init rd_init(void)
437{
438 int i;
439 int err;
440
441 err = bdi_init(&rd_backing_dev_info);
442 if (err)
443 goto out2;
444
445 err = bdi_init(&rd_file_backing_dev_info);
446 if (err) {
447 bdi_destroy(&rd_backing_dev_info);
448 goto out2;
449 }
450
451 err = -ENOMEM;
452
453 if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 ||
454 !is_power_of_2(rd_blocksize)) {
455 printk("RAMDISK: wrong blocksize %d, reverting to defaults\n",
456 rd_blocksize);
457 rd_blocksize = BLOCK_SIZE;
458 }
459
460 for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
461 rd_disks[i] = alloc_disk(1);
462 if (!rd_disks[i])
463 goto out;
464
465 rd_queue[i] = blk_alloc_queue(GFP_KERNEL);
466 if (!rd_queue[i]) {
467 put_disk(rd_disks[i]);
468 goto out;
469 }
470 }
471
472 if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) {
473 err = -EIO;
474 goto out;
475 }
476
477 for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
478 struct gendisk *disk = rd_disks[i];
479
480 blk_queue_make_request(rd_queue[i], &rd_make_request);
481 blk_queue_hardsect_size(rd_queue[i], rd_blocksize);
482
483 /* rd_size is given in kB */
484 disk->major = RAMDISK_MAJOR;
485 disk->first_minor = i;
486 disk->fops = &rd_bd_op;
487 disk->queue = rd_queue[i];
488 disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
489 sprintf(disk->disk_name, "ram%d", i);
490 set_capacity(disk, rd_size * 2);
491 add_disk(rd_disks[i]);
492 }
493
494 /* rd_size is given in kB */
495 printk("RAMDISK driver initialized: "
496 "%d RAM disks of %dK size %d blocksize\n",
497 CONFIG_BLK_DEV_RAM_COUNT, rd_size, rd_blocksize);
498
499 return 0;
500out:
501 while (i--) {
502 put_disk(rd_disks[i]);
503 blk_cleanup_queue(rd_queue[i]);
504 }
505 bdi_destroy(&rd_backing_dev_info);
506 bdi_destroy(&rd_file_backing_dev_info);
507out2:
508 return err;
509}
510
511module_init(rd_init);
512module_exit(rd_cleanup);
513
514/* options - nonmodular */
515#ifndef MODULE
516static int __init ramdisk_size(char *str)
517{
518 rd_size = simple_strtol(str,NULL,0);
519 return 1;
520}
521static int __init ramdisk_blocksize(char *str)
522{
523 rd_blocksize = simple_strtol(str,NULL,0);
524 return 1;
525}
526__setup("ramdisk_size=", ramdisk_size);
527__setup("ramdisk_blocksize=", ramdisk_blocksize);
528#endif
529
530/* options - modular */
531module_param(rd_size, int, 0);
532MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
533module_param(rd_blocksize, int, 0);
534MODULE_PARM_DESC(rd_blocksize, "Blocksize of each RAM disk in bytes.");
535MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
536
537MODULE_LICENSE("GPL");