rewrite rd

This is a rewrite of the ramdisk block device driver. The old one is really difficult because it effectively implements a block device which serves data out of its own buffer cache. It relies on the dirty bit being set, to pin its backing store in cache, however there are non-trivial paths which can clear the dirty bit (eg. try_to_free_buffers()), which has recently led to data corruption. And in general it is completely wrong for a block device driver to do this. The new one is more like a regular block device driver. It has no idea about vm/vfs stuff. Its backing store is similar to the buffer cache (a simple radix-tree of pages), but it doesn't know anything about page cache (the pages in the radix tree are not pagecache pages). There is one slight downside -- direct block device access and filesystem metadata access goes through an extra copy and gets stored in RAM twice. However, this downside is only slight, because the real buffercache of the device is now reclaimable (because we're not playing crazy games with it), so under memory intensive situations, footprint should effectively be the same -- maybe even a slight advantage to the new driver because it can also reclaim buffer heads. The fact that it now goes through all the regular vm/fs paths makes it much more useful for testing, too. text data bss dec hex filename 2837 849 384 4070 fe6 drivers/block/rd.o 3528 371 12 3911 f47 drivers/block/brd.o Text is larger, but data and bss are smaller, making total size smaller. A few other nice things about it: - Similar structure and layout to the new loop device handling. - Dynamic ramdisk creation. - Runtime flexible buffer head size (because it is no longer part of the ramdisk code). - Boot / load time flexible ramdisk size, which could easily be extended to a per-ramdisk runtime changeable size (eg. with an ioctl). - Can use highmem for the backing store. 
[akpm@linux-foundation.org: fix build] [byron.bbradley@gmail.com: make rd_size non-static] Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Byron Bradley <byron.bbradley@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Nick Piggin <npiggin@suse.de> 2008-02-08 07:19:49 -0500
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2008-02-08 12:22:30 -0500
commit: 9db5579be4bb5320c3248f6acf807aedf05ae143 (patch)
tree: fde09bbeb427946b30d2e0fb6b00494a42488052 /drivers/block/brd.c
parent: daeb51e62cacde31c8245866e1096ff79a0c83fe (diff)
1 files changed, 548 insertions, 0 deletions
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
new file mode 100644
index 000000000000..50b659bedc8f
--- /dev/null
+++ b/drivers/block/brd.c
@@ -0,0 +1,548 @@
+/*
+ * Ram backed block device driver.
+ *
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ *
+ * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
+ * of their respective owners.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/radix-tree.h>
+#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
+#include <asm/uaccess.h>
+/* log2 of the sector size (1 << 9 == 512 bytes) used for sector arithmetic */
+#define SECTOR_SHIFT            9
+/* log2 of the number of sectors that fit in one page */
+#define PAGE_SECTORS_SHIFT      (PAGE_SHIFT - SECTOR_SHIFT)
+/* number of sectors per page; PAGE_SECTORS-1 masks the sector-within-page */
+#define PAGE_SECTORS            (1 << PAGE_SECTORS_SHIFT)
+/*
+ * Each block ramdisk device has a radix_tree brd_pages of pages that stores
+ * the pages containing the block device's contents. A brd page's ->index is
+ * its offset in PAGE_SIZE units. This is similar to, but in no way connected
+ * with, the kernel's pagecache or buffer cache (which sit above our block
+ * device).
+ */
+struct brd_device {
+        /* Device number; used as the first minor and in the "ram%d" name. */
+        int             brd_number;
+        /*
+         * NOTE(review): the next four fields are not referenced by any code
+         * in this file; presumably carried over from the loop.c layout.
+         */
+        int             brd_refcnt;
+        loff_t          brd_offset;
+        loff_t          brd_sizelimit;
+        unsigned        brd_blocksize;
+        /* Request queue whose make_request function is brd_make_request(). */
+        struct request_queue    *brd_queue;
+        /* The gendisk; its ->private_data points back at this structure. */
+        struct gendisk          *brd_disk;
+        /* Link in the global brd_devices list. */
+        struct list_head        brd_list;
+        /*
+         * Backing store of pages and lock to protect it. This is the contents
+         * of the block device.
+         */
+        /* Taken around radix_tree_insert() in brd_insert_page(). */
+        spinlock_t              brd_lock;
+        /* Radix tree of backing pages keyed by page index. */
+        struct radix_tree_root  brd_pages;
+};
+/*
+ * Find the backing page that holds @sector.
+ *
+ * Returns the page, or NULL when no page has been inserted for that part
+ * of the device yet.
+ */
+static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
+{
+        /* Backing pages are keyed by page index, not by sector number. */
+        const pgoff_t index = sector >> PAGE_SECTORS_SHIFT;
+        struct page *found;
+
+        /*
+         * Holding the device node open keeps brd pages alive: nothing
+         * deletes them underneath us, so neither a lock nor a reference is
+         * taken on the page itself.
+         *
+         * The same holds for the radix-tree nodes, which makes the
+         * rcu_read_lock() strictly unnecessary. The radix-tree API does not
+         * document that guarantee though, and we are only excluded from
+         * deletes, not from all tree updates, so stay on the safe side.
+         */
+        rcu_read_lock();
+        found = radix_tree_lookup(&brd->brd_pages, index);
+        rcu_read_unlock();
+
+        /* A page stored under @index must carry that same index. */
+        if (found)
+                BUG_ON(found->index != index);
+        return found;
+}
+/*
+ * Return the backing page for @sector, creating a zeroed one first when the
+ * device has none there yet.
+ *
+ * Returns NULL if the page or the radix-tree preload cannot be allocated.
+ */
+static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
+{
+        const pgoff_t index = sector >> PAGE_SECTORS_SHIFT;
+        struct page *fresh;
+        struct page *result;
+
+        result = brd_lookup_page(brd, sector);
+        if (result)
+                return result;
+
+        /*
+         * GFP_NOIO: page reclaim must not recurse back into the block or
+         * filesystem layers on our behalf.
+         */
+        fresh = alloc_page(GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO);
+        if (!fresh)
+                return NULL;
+        if (radix_tree_preload(GFP_NOIO) != 0) {
+                __free_page(fresh);
+                return NULL;
+        }
+
+        spin_lock(&brd->brd_lock);
+        if (radix_tree_insert(&brd->brd_pages, index, fresh) == 0) {
+                fresh->index = index;
+                result = fresh;
+        } else {
+                /*
+                 * Insert failed: drop our page and use whatever is in the
+                 * tree at this index (presumably someone else got there
+                 * first; the BUG_ONs insist that a valid page is present).
+                 */
+                __free_page(fresh);
+                result = radix_tree_lookup(&brd->brd_pages, index);
+                BUG_ON(!result);
+                BUG_ON(result->index != index);
+        }
+        spin_unlock(&brd->brd_lock);
+        radix_tree_preload_end();
+
+        return result;
+}
+/*
+ * Drop every backing page and empty the radix tree. Only legal while the
+ * device has no other users.
+ */
+#define FREE_BATCH 16
+static void brd_free_pages(struct brd_device *brd)
+{
+        struct page *batch[FREE_BATCH];
+        unsigned long next = 0;
+        int found;
+
+        for (;;) {
+                int k;
+
+                found = radix_tree_gang_lookup(&brd->brd_pages,
+                                (void **)batch, next, FREE_BATCH);
+                for (k = 0; k < found; k++) {
+                        struct page *victim = batch[k];
+                        void *removed;
+
+                        /* Results must not go backwards past the cursor. */
+                        BUG_ON(victim->index < next);
+                        next = victim->index;
+                        removed = radix_tree_delete(&brd->brd_pages, next);
+                        BUG_ON(!removed || removed != victim);
+                        __free_page(victim);
+                }
+                /* Resume the next lookup just past the last index seen. */
+                next++;
+
+                /*
+                 * A short batch is taken to mean the tree is exhausted,
+                 * i.e. radix_tree_gang_lookup is assumed to always return
+                 * as many entries as it can. Revisit this if the radix-tree
+                 * code ever changes that.
+                 */
+                if (found != FREE_BATCH)
+                        break;
+        }
+}
+/*
+ * Make sure backing pages exist for an n-byte write starting at @sector.
+ * Has to run before copy_to_brd(); unlike that function it may sleep.
+ *
+ * Returns 0 on success or -ENOMEM if a page could not be inserted.
+ */
+static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
+{
+        unsigned int in_page = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+        /* Bytes that land in the first page touched by the write. */
+        size_t first = min_t(size_t, n, PAGE_SIZE - in_page);
+
+        if (!brd_insert_page(brd, sector))
+                return -ENOMEM;
+        if (first >= n)
+                return 0;
+
+        /* The write spills over into the following page as well. */
+        if (!brd_insert_page(brd, sector + (first >> SECTOR_SHIFT)))
+                return -ENOMEM;
+        return 0;
+}
+/*
+ * Write n bytes from @src into the brd, beginning at @sector. Never sleeps.
+ * The backing pages must already exist (see copy_to_brd_setup()); a missing
+ * page trips BUG_ON.
+ */
+static void copy_to_brd(struct brd_device *brd, const void *src,
+                        sector_t sector, size_t n)
+{
+        unsigned int in_page = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+        /* head goes into the first page, tail (if any) into the next one. */
+        size_t head = min_t(size_t, n, PAGE_SIZE - in_page);
+        size_t tail = n - head;
+        struct page *target;
+        void *kaddr;
+
+        target = brd_lookup_page(brd, sector);
+        BUG_ON(!target);
+        kaddr = kmap_atomic(target, KM_USER1);
+        memcpy(kaddr + in_page, src, head);
+        kunmap_atomic(kaddr, KM_USER1);
+
+        if (!tail)
+                return;
+
+        /* The remainder starts at offset 0 of the following page. */
+        target = brd_lookup_page(brd, sector + (head >> SECTOR_SHIFT));
+        BUG_ON(!target);
+        kaddr = kmap_atomic(target, KM_USER1);
+        memcpy(kaddr, src + head, tail);
+        kunmap_atomic(kaddr, KM_USER1);
+}
+/*
+ * Read n bytes from the brd, beginning at @sector, into @dst. Never sleeps.
+ * Regions with no backing page read back as zeroes.
+ */
+static void copy_from_brd(void *dst, struct brd_device *brd,
+                        sector_t sector, size_t n)
+{
+        unsigned int in_page = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+        /* head comes from the first page, tail (if any) from the next one. */
+        size_t head = min_t(size_t, n, PAGE_SIZE - in_page);
+        size_t tail = n - head;
+        struct page *source;
+        void *kaddr;
+
+        source = brd_lookup_page(brd, sector);
+        if (!source) {
+                memset(dst, 0, head);
+        } else {
+                kaddr = kmap_atomic(source, KM_USER1);
+                memcpy(dst, kaddr + in_page, head);
+                kunmap_atomic(kaddr, KM_USER1);
+        }
+
+        if (!tail)
+                return;
+
+        /* The remainder starts at offset 0 of the following page. */
+        dst += head;
+        source = brd_lookup_page(brd, sector + (head >> SECTOR_SHIFT));
+        if (!source) {
+                memset(dst, 0, tail);
+        } else {
+                kaddr = kmap_atomic(source, KM_USER1);
+                memcpy(dst, kaddr, tail);
+                kunmap_atomic(kaddr, KM_USER1);
+        }
+}
+/*
+ * Process a single bvec of a bio.
+ *
+ * @brd:    ramdisk the I/O is aimed at
+ * @page:   caller's page holding (READ: receiving) the data
+ * @len:    number of bytes to transfer
+ * @off:    byte offset of the data within @page
+ * @rw:     READ to copy from the ramdisk into @page, anything else to copy
+ *          from @page into the ramdisk
+ * @sector: first ramdisk sector of the transfer
+ *
+ * Returns 0 on success, or the error from copy_to_brd_setup() when backing
+ * pages for a write cannot be set up.
+ */
+static int brd_do_bvec(struct brd_device *brd, struct page *page,
+                        unsigned int len, unsigned int off, int rw,
+                        sector_t sector)
+{
+        void *mem;
+        int err;
+
+        if (rw != READ) {
+                /* May sleep, so it has to happen before kmap_atomic(). */
+                err = copy_to_brd_setup(brd, sector, len);
+                if (err)
+                        return err;
+        }
+
+        mem = kmap_atomic(page, KM_USER0);
+        if (rw == READ) {
+                copy_from_brd(mem + off, brd, sector, len);
+                /* We wrote @page through the kernel mapping. */
+                flush_dcache_page(page);
+        } else {
+                /*
+                 * We are about to read @page through the kernel mapping;
+                 * flush first so data written through other (e.g. user)
+                 * mappings of the page is what ends up in the ramdisk on
+                 * architectures with aliasing data caches.
+                 */
+                flush_dcache_page(page);
+                copy_to_brd(brd, mem + off, sector, len);
+        }
+        kunmap_atomic(mem, KM_USER0);
+
+        return 0;
+}
+/*
+ * make_request function of the brd queue: service @bio synchronously, one
+ * segment at a time, and complete it with bio_endio().
+ *
+ * A bio that extends past the end of the disk is completed with -EIO. The
+ * function itself always returns 0.
+ */
+static int brd_make_request(struct request_queue *q, struct bio *bio)
+{
+        struct gendisk *disk = bio->bi_bdev->bd_disk;
+        struct brd_device *brd = disk->private_data;
+        sector_t pos = bio->bi_sector;
+        struct bio_vec *seg;
+        int status = -EIO;
+        int dir;
+        int n;
+
+        if (pos + (bio->bi_size >> SECTOR_SHIFT) <= get_capacity(disk)) {
+                /* Read-ahead is handled exactly like an ordinary read. */
+                dir = bio_rw(bio);
+                if (dir == READA)
+                        dir = READ;
+
+                bio_for_each_segment(seg, bio, n) {
+                        unsigned int bytes = seg->bv_len;
+
+                        status = brd_do_bvec(brd, seg->bv_page, bytes,
+                                                seg->bv_offset, dir, pos);
+                        if (status)
+                                break;
+                        pos += bytes >> SECTOR_SHIFT;
+                }
+        }
+
+        bio_endio(bio, status);
+        return 0;
+}
+/*
+ * ioctl handler. Only BLKFLSBUF is supported; every other command gets
+ * -ENOTTY.
+ *
+ * On a ram device BLKFLSBUF has special semantics: it really releases and
+ * destroys the ramdisk data. That is refused with -EBUSY while the device
+ * has more than one opener.
+ */
+static int brd_ioctl(struct inode *inode, struct file *file,
+                        unsigned int cmd, unsigned long arg)
+{
+        struct block_device *bdev = inode->i_bdev;
+        struct brd_device *brd = bdev->bd_disk->private_data;
+        int ret = -EBUSY;
+
+        switch (cmd) {
+        case BLKFLSBUF:
+                break;
+        default:
+                return -ENOTTY;
+        }
+
+        mutex_lock(&bdev->bd_mutex);
+        if (bdev->bd_openers > 1)
+                goto unlock;
+
+        /*
+         * Throw the cache away before the backing pages so that it cannot
+         * be written back to the device afterwards.
+         *
+         * Another thread may still instantiate new buffercache at this
+         * point; there is not much we can do to close that race.
+         */
+        invalidate_bh_lrus();
+        truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+        brd_free_pages(brd);
+        ret = 0;
+unlock:
+        mutex_unlock(&bdev->bd_mutex);
+        return ret;
+}
+/*
+ * Block device operations installed on every brd gendisk by brd_alloc().
+ * Only an ioctl handler is provided.
+ */
+static struct block_device_operations brd_fops = {
+        .owner =        THIS_MODULE,
+        .ioctl =        brd_ioctl,
+};
+/*
+ * And now the modules code and kernel interface.
+ */
+/* Number of devices requested; 0 (the default) means "not specified". */
+static int rd_nr;
+/* Size of each ramdisk in kbytes; brd_alloc() doubles it to get sectors. */
+int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
+module_param(rd_nr, int, 0);
+MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
+module_param(rd_size, int, 0);
+MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
+#ifndef MODULE
+/*
+ * Legacy boot options, built-in (non-modular) case only. Both "ramdisk="
+ * and "ramdisk_size=" set rd_size from the number that follows.
+ */
+static int __init ramdisk_size(char *str)
+{
+        long kbytes = simple_strtol(str, NULL, 0);
+
+        rd_size = kbytes;
+        return 1;
+}
+/* Second entry point so that each __setup() gets its own handler. */
+static int __init ramdisk_size2(char *str)
+{
+        int handled = ramdisk_size(str);
+
+        return handled;
+}
+__setup("ramdisk=", ramdisk_size);
+__setup("ramdisk_size=", ramdisk_size2);
+#endif
+/*
+ * The device scheme is derived from loop.c. Keep them in synch where possible
+ * (should share code eventually).
+ */
+/* All brd devices created so far, linked through brd_device.brd_list. */
+static LIST_HEAD(brd_devices);
+/* Held by brd_probe() around the on-demand lookup/creation of a device. */
+static DEFINE_MUTEX(brd_devices_mutex);
+/*
+ * Allocate and set up brd device number @i: the brd_device itself, its
+ * request queue and its gendisk ("ram<i>", minor @i on RAMDISK_MAJOR).
+ *
+ * The disk is neither registered with add_disk() nor put on brd_devices;
+ * both are left to the caller.
+ *
+ * Returns the new device, or NULL if any allocation fails (everything
+ * allocated up to that point is released again).
+ */
+static struct brd_device *brd_alloc(int i)
+{
+        struct brd_device *brd;
+        struct gendisk *disk;
+
+        brd = kzalloc(sizeof(*brd), GFP_KERNEL);
+        if (!brd)
+                goto out;
+        brd->brd_number         = i;
+        spin_lock_init(&brd->brd_lock);
+        INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
+
+        brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
+        if (!brd->brd_queue)
+                goto out_free_dev;
+        blk_queue_make_request(brd->brd_queue, brd_make_request);
+        blk_queue_max_sectors(brd->brd_queue, 1024);
+        blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
+
+        disk = brd->brd_disk = alloc_disk(1);
+        if (!disk)
+                goto out_free_queue;
+        disk->major             = RAMDISK_MAJOR;
+        disk->first_minor       = i;
+        disk->fops              = &brd_fops;
+        disk->private_data      = brd;
+        disk->queue             = brd->brd_queue;
+        sprintf(disk->disk_name, "ram%d", i);
+        /*
+         * rd_size is in kbytes, i.e. two 512-byte sectors each. Widen to
+         * sector_t before multiplying so that a large rd_size cannot
+         * overflow int arithmetic.
+         */
+        set_capacity(disk, (sector_t)rd_size * 2);
+
+        return brd;
+
+out_free_queue:
+        blk_cleanup_queue(brd->brd_queue);
+out_free_dev:
+        kfree(brd);
+out:
+        return NULL;
+}
+/*
+ * Undo brd_alloc(): drop the gendisk reference, clean up the request queue,
+ * free all backing pages and finally the brd_device itself. The caller must
+ * already have taken @brd off brd_devices.
+ */
+static void brd_free(struct brd_device *brd)
+{
+        put_disk(brd->brd_disk);
+        blk_cleanup_queue(brd->brd_queue);
+        brd_free_pages(brd);
+        kfree(brd);
+}
+/*
+ * Return brd device number @i, creating and registering it first if it is
+ * not on brd_devices yet. brd_probe() calls this with brd_devices_mutex
+ * held.
+ *
+ * Returns NULL if a new device is needed but cannot be allocated.
+ */
+static struct brd_device *brd_init_one(int i)
+{
+        struct brd_device *cur;
+        struct brd_device *created;
+
+        list_for_each_entry(cur, &brd_devices, brd_list) {
+                if (cur->brd_number == i)
+                        return cur;
+        }
+
+        created = brd_alloc(i);
+        if (!created)
+                return NULL;
+
+        add_disk(created->brd_disk);
+        list_add_tail(&created->brd_list, &brd_devices);
+        return created;
+}
+/*
+ * Tear down one registered device: unlink it from brd_devices, remove its
+ * gendisk and release everything else through brd_free().
+ */
+static void brd_del_one(struct brd_device *brd)
+{
+        list_del(&brd->brd_list);
+        del_gendisk(brd->brd_disk);
+        brd_free(brd);
+}
+/*
+ * Probe callback handed to blk_register_region(): look up, or create on
+ * demand, the brd device whose minor is encoded in @dev.
+ *
+ * Stores 0 in *@part and returns the result of get_disk() on the device's
+ * gendisk, or ERR_PTR(-ENOMEM) when the device cannot be created.
+ */
+static struct kobject *brd_probe(dev_t dev, int *part, void *data)
+{
+        struct brd_device *found;
+        struct kobject *result;
+
+        mutex_lock(&brd_devices_mutex);
+        found = brd_init_one(dev & MINORMASK);
+        if (found)
+                result = get_disk(found->brd_disk);
+        else
+                result = ERR_PTR(-ENOMEM);
+        mutex_unlock(&brd_devices_mutex);
+
+        *part = 0;
+        return result;
+}
+/*
+ * Module / boot-time initialisation: register the ramdisk major, create the
+ * initial set of devices and register the minor range for on-demand probing.
+ *
+ * Returns 0 on success, -EINVAL if rd_nr exceeds the minor space, -EIO if
+ * the major cannot be registered, or -ENOMEM if a device cannot be
+ * allocated (in which case everything is rolled back).
+ */
+static int __init brd_init(void)
+{
+        int i, nr;
+        unsigned long range;
+        struct brd_device *brd, *next;
+
+        /*
+         * brd module now has a feature to instantiate underlying device
+         * structure on-demand, provided that there is an access dev node.
+         * However, this will not work well with user space tool that doesn't
+         * know about such "feature".  In order to not break any existing
+         * tool, we do the following:
+         *
+         * (1) if rd_nr is specified, create that many upfront, and this
+         *     also becomes a hard limit.
+         * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
+         *     devices on module load, user can further extend brd device by
+         *     create dev node themselves and have kernel automatically
+         *     instantiate actual device on-demand.
+         */
+        if (rd_nr > 1UL << MINORBITS)
+                return -EINVAL;
+
+        if (rd_nr) {
+                nr = rd_nr;
+                range = rd_nr;
+        } else {
+                nr = CONFIG_BLK_DEV_RAM_COUNT;
+                range = 1UL << MINORBITS;
+        }
+
+        if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
+                return -EIO;
+
+        for (i = 0; i < nr; i++) {
+                brd = brd_alloc(i);
+                if (!brd)
+                        goto out_free;
+                list_add_tail(&brd->brd_list, &brd_devices);
+        }
+
+        /* point of no return */
+
+        list_for_each_entry(brd, &brd_devices, brd_list)
+                add_disk(brd->brd_disk);
+
+        blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
+                                  THIS_MODULE, brd_probe, NULL, NULL);
+
+        printk(KERN_INFO "brd: module loaded\n");
+        return 0;
+
+out_free:
+        /* None of these disks was added yet, so brd_free() is enough. */
+        list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
+                list_del(&brd->brd_list);
+                brd_free(brd);
+        }
+
+        /* Must use the same name the major was registered under above. */
+        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+        return -ENOMEM;
+}
+/*
+ * Module unload: destroy every device on brd_devices, then unregister the
+ * minor range and the ramdisk major that brd_init() registered.
+ */
+static void __exit brd_exit(void)
+{
+        struct brd_device *cur, *tmp;
+        unsigned long span;
+
+        /* Must match the range brd_init() passed to blk_register_region(). */
+        if (rd_nr)
+                span = rd_nr;
+        else
+                span = 1UL << MINORBITS;
+
+        list_for_each_entry_safe(cur, tmp, &brd_devices, brd_list)
+                brd_del_one(cur);
+
+        blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), span);
+        unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+}
+/* Hook brd_init()/brd_exit() up as the module's entry and exit points. */
+module_init(brd_init);
+module_exit(brd_exit);
author	Nick Piggin <npiggin@suse.de>	2008-02-08 07:19:49 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-08 12:22:30 -0500
commit	9db5579be4bb5320c3248f6acf807aedf05ae143 (patch)
tree	fde09bbeb427946b30d2e0fb6b00494a42488052 /drivers/block/brd.c
parent	daeb51e62cacde31c8245866e1096ff79a0c83fe (diff)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c new file mode 100644 index 000000000000..50b659bedc8f --- /dev/null +++ b/drivers/block/brd.c
@@ -0,0 +1,548 @@
	1	/*
	2	* Ram backed block device driver.
	3	*
	4	* Copyright (C) 2007 Nick Piggin
	5	* Copyright (C) 2007 Novell Inc.
	6	*
	7	* Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
	8	* of their respective owners.
	9	*/
	10
	11	#include <linux/init.h>
	12	#include <linux/module.h>
	13	#include <linux/moduleparam.h>
	14	#include <linux/major.h>
	15	#include <linux/blkdev.h>
	16	#include <linux/bio.h>
	17	#include <linux/highmem.h>
	18	#include <linux/gfp.h>
	19	#include <linux/radix-tree.h>
	20	#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
	21
	22	#include <asm/uaccess.h>
	23
	24	#define SECTOR_SHIFT 9
	25	#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
	26	#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
	27
	28	/*
	29	* Each block ramdisk device has a radix_tree brd_pages of pages that stores
	30	* the pages containing the block device's contents. A brd page's ->index is
	31	* its offset in PAGE_SIZE units. This is similar to, but in no way connected
	32	* with, the kernel's pagecache or buffer cache (which sit above our block
	33	* device).
	34	*/
	35	struct brd_device {
	36	int brd_number;
	37	int brd_refcnt;
	38	loff_t brd_offset;
	39	loff_t brd_sizelimit;
	40	unsigned brd_blocksize;
	41
	42	struct request_queue *brd_queue;
	43	struct gendisk *brd_disk;
	44	struct list_head brd_list;
	45
	46	/*
	47	* Backing store of pages and lock to protect it. This is the contents
	48	* of the block device.
	49	*/
	50	spinlock_t brd_lock;
	51	struct radix_tree_root brd_pages;
	52	};
	53
	54	/*
	55	* Look up and return a brd's page for a given sector.
	56	*/
	57	static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
	58	{
	59	pgoff_t idx;
	60	struct page *page;
	61
	62	/*
	63	* The page lifetime is protected by the fact that we have opened the
	64	* device node -- brd pages will never be deleted under us, so we
	65	* don't need any further locking or refcounting.
	66	*
	67	* This is strictly true for the radix-tree nodes as well (ie. we
	68	* don't actually need the rcu_read_lock()), however that is not a
	69	* documented feature of the radix-tree API so it is better to be
	70	* safe here (we don't have total exclusion from radix tree updates
	71	* here, only deletes).
	72	*/
	73	rcu_read_lock();
	74	idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
	75	page = radix_tree_lookup(&brd->brd_pages, idx);
	76	rcu_read_unlock();
	77
	78	BUG_ON(page && page->index != idx);
	79
	80	return page;
	81	}
	82
	83	/*
	84	* Look up and return a brd's page for a given sector.
	85	* If one does not exist, allocate an empty page, and insert that. Then
	86	* return it.
	87	*/
	88	static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
	89	{
	90	pgoff_t idx;
	91	struct page *page;
	92
	93	page = brd_lookup_page(brd, sector);
	94	if (page)
	95	return page;
	96
	97	/*
	98	* Must use NOIO because we don't want to recurse back into the
	99	* block or filesystem layers from page reclaim.
	100	*/
	101	page = alloc_page(GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO);
	102	if (!page)
	103	return NULL;
	104
	105	if (radix_tree_preload(GFP_NOIO)) {
	106	__free_page(page);
	107	return NULL;
	108	}
	109
	110	spin_lock(&brd->brd_lock);
	111	idx = sector >> PAGE_SECTORS_SHIFT;
	112	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
	113	__free_page(page);
	114	page = radix_tree_lookup(&brd->brd_pages, idx);
	115	BUG_ON(!page);
	116	BUG_ON(page->index != idx);
	117	} else
	118	page->index = idx;
	119	spin_unlock(&brd->brd_lock);
	120
	121	radix_tree_preload_end();
	122
	123	return page;
	124	}
	125
	126	/*
	127	* Free all backing store pages and radix tree. This must only be called when
	128	* there are no other users of the device.
	129	*/
	130	#define FREE_BATCH 16
	131	static void brd_free_pages(struct brd_device *brd)
	132	{
	133	unsigned long pos = 0;
	134	struct page *pages[FREE_BATCH];
	135	int nr_pages;
	136
	137	do {
	138	int i;
	139
	140	nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
	141	(void **)pages, pos, FREE_BATCH);
	142
	143	for (i = 0; i < nr_pages; i++) {
	144	void *ret;
	145
	146	BUG_ON(pages[i]->index < pos);
	147	pos = pages[i]->index;
	148	ret = radix_tree_delete(&brd->brd_pages, pos);
	149	BUG_ON(!ret || ret != pages[i]);
	150	__free_page(pages[i]);
	151	}
	152
	153	pos++;
	154
	155	/*
	156	* This assumes radix_tree_gang_lookup always returns as
	157	* many pages as possible. If the radix-tree code changes,
	158	* so will this have to.
	159	*/
	160	} while (nr_pages == FREE_BATCH);
	161	}
	162
	163	/*
	164	* copy_to_brd_setup must be called before copy_to_brd. It may sleep.
	165	*/
	166	static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
	167	{
	168	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	169	size_t copy;
	170
	171	copy = min_t(size_t, n, PAGE_SIZE - offset);
	172	if (!brd_insert_page(brd, sector))
	173	return -ENOMEM;
	174	if (copy < n) {
	175	sector += copy >> SECTOR_SHIFT;
	176	if (!brd_insert_page(brd, sector))
	177	return -ENOMEM;
	178	}
	179	return 0;
	180	}
	181
	182	/*
	183	* Copy n bytes from src to the brd starting at sector. Does not sleep.
	184	*/
	185	static void copy_to_brd(struct brd_device *brd, const void *src,
	186	sector_t sector, size_t n)
	187	{
	188	struct page *page;
	189	void *dst;
	190	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	191	size_t copy;
	192
	193	copy = min_t(size_t, n, PAGE_SIZE - offset);
	194	page = brd_lookup_page(brd, sector);
	195	BUG_ON(!page);
	196
	197	dst = kmap_atomic(page, KM_USER1);
	198	memcpy(dst + offset, src, copy);
	199	kunmap_atomic(dst, KM_USER1);
	200
	201	if (copy < n) {
	202	src += copy;
	203	sector += copy >> SECTOR_SHIFT;
	204	copy = n - copy;
	205	page = brd_lookup_page(brd, sector);
	206	BUG_ON(!page);
	207
	208	dst = kmap_atomic(page, KM_USER1);
	209	memcpy(dst, src, copy);
	210	kunmap_atomic(dst, KM_USER1);
	211	}
	212	}
	213
	214	/*
	215	* Copy n bytes to dst from the brd starting at sector. Does not sleep.
	216	*/
	217	static void copy_from_brd(void *dst, struct brd_device *brd,
	218	sector_t sector, size_t n)
	219	{
	220	struct page *page;
	221	void *src;
	222	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	223	size_t copy;
	224
	225	copy = min_t(size_t, n, PAGE_SIZE - offset);
	226	page = brd_lookup_page(brd, sector);
	227	if (page) {
	228	src = kmap_atomic(page, KM_USER1);
	229	memcpy(dst, src + offset, copy);
	230	kunmap_atomic(src, KM_USER1);
	231	} else
	232	memset(dst, 0, copy);
	233
	234	if (copy < n) {
	235	dst += copy;
	236	sector += copy >> SECTOR_SHIFT;
	237	copy = n - copy;
	238	page = brd_lookup_page(brd, sector);
	239	if (page) {
	240	src = kmap_atomic(page, KM_USER1);
	241	memcpy(dst, src, copy);
	242	kunmap_atomic(src, KM_USER1);
	243	} else
	244	memset(dst, 0, copy);
	245	}
	246	}
	247
	248	/*
	249	* Process a single bvec of a bio.
	250	*/
	251	static int brd_do_bvec(struct brd_device *brd, struct page *page,
	252	unsigned int len, unsigned int off, int rw,
	253	sector_t sector)
	254	{
	255	void *mem;
	256	int err = 0;
	257
	258	if (rw != READ) {
	259	err = copy_to_brd_setup(brd, sector, len);
	260	if (err)
	261	goto out;
	262	}
	263
	264	mem = kmap_atomic(page, KM_USER0);
	265	if (rw == READ) {
	266	copy_from_brd(mem + off, brd, sector, len);
	267	flush_dcache_page(page);
	268	} else
	269	copy_to_brd(brd, mem + off, sector, len);
	270	kunmap_atomic(mem, KM_USER0);
	271
	272	out:
	273	return err;
	274	}
	275
	276	static int brd_make_request(struct request_queue *q, struct bio *bio)
	277	{
	278	struct block_device *bdev = bio->bi_bdev;
	279	struct brd_device *brd = bdev->bd_disk->private_data;
	280	int rw;
	281	struct bio_vec *bvec;
	282	sector_t sector;
	283	int i;
	284	int err = -EIO;
	285
	286	sector = bio->bi_sector;
	287	if (sector + (bio->bi_size >> SECTOR_SHIFT) >
	288	get_capacity(bdev->bd_disk))
	289	goto out;
	290
	291	rw = bio_rw(bio);
	292	if (rw == READA)
	293	rw = READ;
	294
	295	bio_for_each_segment(bvec, bio, i) {
	296	unsigned int len = bvec->bv_len;
	297	err = brd_do_bvec(brd, bvec->bv_page, len,
	298	bvec->bv_offset, rw, sector);
	299	if (err)
	300	break;
	301	sector += len >> SECTOR_SHIFT;
	302	}
	303
	304	out:
	305	bio_endio(bio, err);
	306
	307	return 0;
	308	}
	309
	310	static int brd_ioctl(struct inode *inode, struct file *file,
	311	unsigned int cmd, unsigned long arg)
	312	{
	313	int error;
	314	struct block_device *bdev = inode->i_bdev;
	315	struct brd_device *brd = bdev->bd_disk->private_data;
	316
	317	if (cmd != BLKFLSBUF)
	318	return -ENOTTY;
	319
	320	/*
	321	* ram device BLKFLSBUF has special semantics, we want to actually
	322	* release and destroy the ramdisk data.
	323	*/
	324	mutex_lock(&bdev->bd_mutex);
	325	error = -EBUSY;
	326	if (bdev->bd_openers <= 1) {
	327	/*
	328	* Invalidate the cache first, so it isn't written
	329	* back to the device.
	330	*
	331	* Another thread might instantiate more buffercache here,
	332	* but there is not much we can do to close that race.
	333	*/
	334	invalidate_bh_lrus();
	335	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
	336	brd_free_pages(brd);
	337	error = 0;
	338	}
	339	mutex_unlock(&bdev->bd_mutex);
	340
	341	return error;
	342	}
	343
	344	static struct block_device_operations brd_fops = {
	345	.owner = THIS_MODULE,
	346	.ioctl = brd_ioctl,
	347	};
	348
	349	/*
	350	* And now the modules code and kernel interface.
	351	*/
	352	static int rd_nr;
	353	int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
	354	module_param(rd_nr, int, 0);
	355	MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
	356	module_param(rd_size, int, 0);
	357	MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
	358	MODULE_LICENSE("GPL");
	359	MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
	360
	361	#ifndef MODULE
	362	/* Legacy boot options - nonmodular */
	363	static int __init ramdisk_size(char *str)
	364	{
	365	rd_size = simple_strtol(str, NULL, 0);
	366	return 1;
	367	}
	368	static int __init ramdisk_size2(char *str)
	369	{
	370	return ramdisk_size(str);
	371	}
	372	__setup("ramdisk=", ramdisk_size);
	373	__setup("ramdisk_size=", ramdisk_size2);
	374	#endif
	375
	376	/*
	377	* The device scheme is derived from loop.c. Keep them in synch where possible
	378	* (should share code eventually).
	379	*/
	380	static LIST_HEAD(brd_devices);
	381	static DEFINE_MUTEX(brd_devices_mutex);
	382
	383	static struct brd_device *brd_alloc(int i)
	384	{
	385	struct brd_device *brd;
	386	struct gendisk *disk;
	387
	388	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
	389	if (!brd)
	390	goto out;
	391	brd->brd_number = i;
	392	spin_lock_init(&brd->brd_lock);
	393	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
	394
	395	brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
	396	if (!brd->brd_queue)
	397	goto out_free_dev;
	398	blk_queue_make_request(brd->brd_queue, brd_make_request);
	399	blk_queue_max_sectors(brd->brd_queue, 1024);
	400	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
	401
	402	disk = brd->brd_disk = alloc_disk(1);
	403	if (!disk)
	404	goto out_free_queue;
	405	disk->major = RAMDISK_MAJOR;
	406	disk->first_minor = i;
	407	disk->fops = &brd_fops;
	408	disk->private_data = brd;
	409	disk->queue = brd->brd_queue;
	410	sprintf(disk->disk_name, "ram%d", i);
	411	set_capacity(disk, rd_size * 2);
	412
	413	return brd;
	414
	415	out_free_queue:
	416	blk_cleanup_queue(brd->brd_queue);
	417	out_free_dev:
	418	kfree(brd);
	419	out:
	420	return NULL;
	421	}
	422
	423	static void brd_free(struct brd_device *brd)
	424	{
	425	put_disk(brd->brd_disk);
	426	blk_cleanup_queue(brd->brd_queue);
	427	brd_free_pages(brd);
	428	kfree(brd);
	429	}
	430
	431	static struct brd_device *brd_init_one(int i)
	432	{
	433	struct brd_device *brd;
	434
	435	list_for_each_entry(brd, &brd_devices, brd_list) {
	436	if (brd->brd_number == i)
	437	goto out;
	438	}
	439
	440	brd = brd_alloc(i);
	441	if (brd) {
	442	add_disk(brd->brd_disk);
	443	list_add_tail(&brd->brd_list, &brd_devices);
	444	}
	445	out:
	446	return brd;
	447	}
	448
	449	static void brd_del_one(struct brd_device *brd)
	450	{
	451	list_del(&brd->brd_list);
	452	del_gendisk(brd->brd_disk);
	453	brd_free(brd);
	454	}
	455
	456	static struct kobject *brd_probe(dev_t dev, int *part, void *data)
	457	{
	458	struct brd_device *brd;
	459	struct kobject *kobj;
	460
	461	mutex_lock(&brd_devices_mutex);
	462	brd = brd_init_one(dev & MINORMASK);
	463	kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM);
	464	mutex_unlock(&brd_devices_mutex);
	465
	466	*part = 0;
	467	return kobj;
	468	}
	469
	470	static int __init brd_init(void)
	471	{
	472	int i, nr;
	473	unsigned long range;
	474	struct brd_device *brd, *next;
	475
	476	/*
	477	* brd module now has a feature to instantiate underlying device
	478	* structure on-demand, provided that there is an access dev node.
	479	* However, this will not work well with user space tool that doesn't
	480	* know about such "feature". In order to not break any existing
	481	* tool, we do the following:
	482	*
	483	* (1) if rd_nr is specified, create that many upfront, and this
	484	* also becomes a hard limit.
	485	* (2) if rd_nr is not specified, create 1 rd device on module
	486	* load, user can further extend brd device by create dev node
	487	* themselves and have kernel automatically instantiate actual
	488	* device on-demand.
	489	*/
	490	if (rd_nr > 1UL << MINORBITS)
	491	return -EINVAL;
	492
	493	if (rd_nr) {
	494	nr = rd_nr;
	495	range = rd_nr;
	496	} else {
	497	nr = CONFIG_BLK_DEV_RAM_COUNT;
	498	range = 1UL << MINORBITS;
	499	}
	500
	501	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
	502	return -EIO;
	503
	504	for (i = 0; i < nr; i++) {
	505	brd = brd_alloc(i);
	506	if (!brd)
	507	goto out_free;
	508	list_add_tail(&brd->brd_list, &brd_devices);
	509	}
	510
	511	/* point of no return */
	512
	513	list_for_each_entry(brd, &brd_devices, brd_list)
	514	add_disk(brd->brd_disk);
	515
	516	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
	517	THIS_MODULE, brd_probe, NULL, NULL);
	518
	519	printk(KERN_INFO "brd: module loaded\n");
	520	return 0;
	521
	522	out_free:
	523	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
	524	list_del(&brd->brd_list);
	525	brd_free(brd);
	526	}
	527
	528	unregister_blkdev(RAMDISK_MAJOR, "brd");
	529	return -ENOMEM;
	530	}
	531
	532	static void __exit brd_exit(void)
	533	{
	534	unsigned long range;
	535	struct brd_device *brd, *next;
	536
	537	range = rd_nr ? rd_nr : 1UL << MINORBITS;
	538
	539	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
	540	brd_del_one(brd);
	541
	542	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
	543	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
	544	}
	545
	546	module_init(brd_init);
	547	module_exit(brd_exit);
	548