/*
* linux/fs/block_dev.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
*/
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/smp_lock.h>
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/kmemleak.h>
#include <asm/uaccess.h>
#include "internal.h"
struct bdev_inode {
struct block_device bdev;
struct inode vfs_inode;
};
static const struct address_space_operations def_blk_aops;
static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
return container_of(inode, struct bdev_inode, vfs_inode);
}
inline struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);
static sector_t max_block(struct block_device *bdev)
{
sector_t retval = ~((sector_t)0);
loff_t sz = i_size_read(bdev->bd_inode);
if (sz) {
unsigned int size = block_size(bdev);
unsigned int sizebits = blksize_bits(size);
retval = (sz >> sizebits);
}
return retval;
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
static void kill_bdev(struct block_device *bdev)
{
if (bdev->bd_inode->i_mapping->nrpages == 0)
return;
invalidate_bh_lrus();
truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
}
int set_blocksize(struct block_device *bdev, int size)
{
/* Size must be a power of two, and between 512 and PAGE_SIZE */
if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
return -EINVAL;
/* Size cannot be smaller than the size supported by the device */
if (size < bdev_logical_block_size(bdev))
return -EINVAL;
/* Don't change the size if it is same as current */
if (bdev->bd_block_size != size) {
sync_blockdev(bdev);
bdev->bd_block_size = size;
bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
}
return 0;
}
EXPORT_SYMBOL(set_blocksize);
int sb_set_blocksize(struct super_block *sb, int size)
{
if (set_blocksize(sb->s_bdev, size))
return 0;
/* If we get here, we know size is power of two
* and it's value is between 512 and PAGE_SIZE */
sb->s_blocksize = size;
sb->s_blocksize_bits = blksize_bits(size);
return sb->s_blocksize;
}
EXPORT_SYMBOL(sb_set_blocksize);
int sb_min_blocksize(struct super_block *sb, int size)
{
int minsize = bdev_logical_block_size(sb->s_bdev);
if (size < minsize)
size = minsize;
return sb_set_blocksize(sb, size);
}
EXPORT_SYMBOL(sb_min_blocksize);
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
if (iblock >= max_block(I_BDEV(inode))) {
if (create)
return -EIO;
/*
* for reads, we're just trying to fill a partial page.
* return a hole, they will have to call get_block again
* before they can fill it, and they will get -EIO at that
* time
*/
return 0;
}
bh->b_bdev = I_BDEV(inode);
bh->b_blocknr = iblock;
set_buffer_mapped(bh);
return 0;
}
static int
blkdev_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
sector_t end_block = max_block(I_BDEV(inode));
unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
if ((iblock + max_blocks) > end_block) {
max_blocks = end_block - iblock;
if ((long)max_blocks <= 0) {
if (create)
return -EIO; /* write fully beyond EOF */
/*
* It is a read which is fully beyond EOF. We return
* a !buffer_mapped buffer
*/
max_blocks = 0;
}
}
bh->b_bdev = I_BDEV(inode);
bh->b_blocknr = iblock;
bh->b_size = max_blocks << inode->i_blkbits;
if (max_blocks)
set_buffer_mapped(bh);
return 0;
}
static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
iov, offset, nr_segs, blkdev_get_blocks, NULL);
}
int __sync_blockdev(struct block_device *bdev, int wait)
{
if (!bdev)
return 0;
if (!wait)
return filemap_flush(bdev->bd_inode->i_mapping);
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}
/*
* Write out and wait upon all the dirty data associated with a block
* device via its mapping. Does not take the superblock lock.
*/
int sync_blockdev(struct block_device *bdev)
{
return __sync_blockdev(bdev, 1);
}
EXPORT_SYMBOL(sync_blockdev);
/*
* Write out and wait upon all dirty data associated with this
* device. Filesystem data as well as the underlying block
* device. Takes the superblock lock.
*/
int fsync_bdev(struct block_device *bdev)
{
struct super_block *sb = get_super(bdev);
if (sb) {
int res = sync_filesystem(sb);
drop_super(sb);
return res;
}
return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);
/**
* freeze_bdev -- lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
*
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
* The reference counter (bd_fsfreeze_count) guarantees that only the last
* unfreeze process can unfreeze the frozen filesystem actually when multiple
* freeze requests arrive simultaneously. It counts up in freeze_bdev() and
* count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
* actually.
*/
struct super_block *freeze_bdev(struct block_device *bdev)
{
struct super_block *sb;
int error = 0;
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (++bdev->bd_fsfreeze_count > 1) {
/*
* We don't even need to grab a reference - the first call
* to freeze_bdev grab an active reference and only the last
* thaw_bdev drops it.
*/
sb = get_super(bdev);
drop_super(sb);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return sb;
}
sb = get_active_super(bdev);
if (!sb)
goto out;
if (sb->s_flags & MS_RDONLY) {
sb->s_frozen = SB_FREEZE_TRANS;
up_write(&sb->s_umount);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return sb;
}
sb->s_frozen = SB_FREEZE_WRITE;
smp_wmb();
sync_filesystem(sb);
sb->s_frozen = SB_FREEZE_TRANS;
smp_wmb();
sync_blockdev(sb->s_bdev);
if (sb->s_op->freeze_fs) {
error = sb->s_op->freeze_fs(sb);
if (error) {
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_frozen = SB_UNFROZEN;
deactivate_locked_super(sb);
bdev->bd_fsfreeze_count--;
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return ERR_PTR(error);
}
}
up_write(&sb->s_umount);
out:
sync_blockdev(bdev);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return sb; /* thaw_bdev releases s->s_umount */
}
EXPORT_SYMBOL(freeze_bdev);
/**
* thaw_bdev -- unlock filesystem
* @bdev: blockdevice to unlock
* @sb: associated superblock
*
* Unlocks the filesystem and marks it writeable again after freeze_bdev().
*/
int thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
int error = -EINVAL;
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (!bdev->bd_fsfreeze_count)
goto out_unlock;
error = 0;
if (--bdev->bd_fsfreeze_count > 0)
goto out_unlock;
if (!sb)
goto out_unlock;
BUG_ON(sb->s_bdev != bdev);
down_write(&sb->s_umount);
if (sb->s_flags & MS_RDONLY)
goto out_unfrozen;
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
if (error) {
printk(KERN_ERR
"VFS:Filesystem thaw failed\n");
sb->s_frozen = SB_FREEZE_TRANS;
bdev->bd_fsfreeze_count++;
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return error;
}
}
out_unfrozen:
sb->s_frozen = SB_UNFROZEN;
smp_wmb();
wake_up(&sb->s_wait_unfrozen);
if (sb)
deactivate_locked_super(sb);
out_unlock:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return 0;
}
EXPORT_SYMBOL(thaw_bdev);
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, blkdev_get_block, wbc);
}
static int blkdev_readpage(struct file * file, struct page * page)
{
return block_read_full_page(page, blkdev_get_block);
}
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
*pagep = NULL;
return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
blkdev_get_block);
}
static int blkdev_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
int ret;
ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
unlock_page(page);
page_cache_release(page);
return ret;
}
/*
* private llseek:
* for a block special file file->f_path.dentry->d_inode->i_size is zero
* so we compute the size by hand (just as in block_read/write above)
*/
static loff_t block_llseek(struct file *file, loff_t offset, int origin)
{
struct inode *bd_inode = file->f_mapping->host;
loff_t size;
loff_t retval;
mutex_lock(&bd_inode->i_mutex);
size = i_size_read(bd_inode);
switch (origin) {
case 2:
offset += size;
break;
case 1:
offset += file->f_pos;
}
retval = -EINVAL;
if (offset >= 0 && offset <= size) {
if (offset != file->f_pos) {
file->f_pos = offset;
}
retval = offset;
}
mutex_unlock(&bd_inode->i_mutex);
return retval;
}
/*
* Filp is never NULL; the only case when ->fsync() is called with
* NULL first argument is nfsd_sync_dir() and that's not a directory.
*/
static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
int error;
error = sync_blockdev(bdev);
if (error)
return error;
error = blkdev_issue_flush(bdev, NULL);
if (error == -EOPNOTSUPP)
error = 0;
return error;
}
/*
* pseudo-fs
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static struct kmem_cache * bdev_cachep __read_mostly;
static struct inode *bdev_alloc_inode(struct super_block *sb)
{
struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
}
static void bdev_destroy_inode(struct inode *inode)
{
struct bdev_inode *bdi = BDEV_I(inode);
kmem_cache_free(bdev_cachep, bdi);
}
static void init_once(void *foo)
{
struct bdev_inode *ei = (struct bdev_inode *) foo;
struct block_device *bdev = &ei->bdev;
memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
INIT_LIST_HEAD(&bdev->bd_holder_list);
#endif
inode_init_once(&ei->vfs_inode);
/* Initialize mutex for freeze. */
mutex_init(&bdev->bd_fsfreeze_mutex);
}
static inline void __bd_forget(struct inode *inode)
{
list_del_init(&inode->i_devices);
inode->i_bdev = NULL;
inode->i_mapping = &inode->i_data;
}
static void bdev_clear_inode(struct inode *inode)
{
struct block_device *bdev = &BDEV_I(inode)->bdev;
struct list_head *p;
spin_lock(&bdev_lock);
while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
__bd_forget(list_entry(p, struct inode, i_devices));
}
list_del_init(&bdev->bd_list);
spin_unlock(&bdev_lock);
}
static const struct super_operations bdev_sops = {
.statfs = simple_statfs,
.alloc_inode = bdev_alloc_inode,
.destroy_inode = bdev_destroy_inode,
.drop_inode = generic_delete_inode,
.clear_inode = bdev_clear_inode,
};
static int bd_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
}
static struct file_system_type bd_type = {
.name = "bdev",
.get_sb = bd_get_sb,
.kill_sb = kill_anon_super,
};
struct super_block *blockdev_superblock __read_mostly;
void __init bdev_cache_init(void)
{
int err;
struct vfsmount *bd_mnt;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)
panic("Cannot register bdev pseudo-fs");
bd_mnt = kern_mount(&bd_type);
if (IS_ERR(bd_mnt))
panic("Cannot create bdev pseudo-fs");
/*
* This vfsmount structure is only used to obtain the
* blockdev_superblock, so tell kmemleak not to report it.
*/
kmemleak_not_leak(bd_mnt);
blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
}
/*
* Most likely _very_ bad one - but then it's hardly critical for small
* /dev and can be fixed when somebody will need really large one.
* Keep in mind that it will be fed through icache hash function too.
*/
static inline unsigned long hash(dev_t dev)
{
return MAJOR(dev)+MINOR(dev);
}
static int bdev_test(struct inode *inode, void *data)
{
return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
}
static int bdev_set(struct inode *inode, void *data)
{
BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
return 0;
}
static LIST_HEAD(all_bdevs);
struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
struct inode *inode;
inode = iget5_locked(blockdev_superblock, hash(dev),
bdev_test, bdev_set, &dev);
if (!inode)
return NULL;
bdev = &BDEV_I(inode)->bdev;
if (inode->i_state & I_NEW) {
bdev->bd_contains = NULL;
bdev->bd_inode = inode;
bdev->bd_block_size = (1 << inode->i_blkbits);
bdev->bd_part_count = 0;
bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
inode->i_data.backing_dev_info = &default_backing_dev_info;
spin_lock(&bdev_lock);
list_add(&bdev->bd_list, &all_bdevs);
spin_unlock(&bdev_lock);
unlock_new_inode(inode);
}
return bdev;
}
EXPORT_SYMBOL(bdget);
/**
* bdgrab -- Grab a reference to an already referenced block device
* @bdev: Block device to grab a reference to.
*/
struct block_device *bdgrab(struct block_device *bdev)
{
atomic_inc(&bdev->bd_inode->i_count);
return bdev;
}
long nr_blockdev_pages(void)
{
struct block_device *bdev;
long ret = 0;
spin_lock(&bdev_lock);
list_for_each_entry(bdev, &all_bdevs, bd_list) {
ret += bdev->bd_inode->i_mapping->nrpages;
}
spin_unlock(&bdev_lock);
return ret;
}
void bdput(struct block_device *bdev)
{
iput(bdev->bd_inode);
}
EXPORT_SYMBOL(bdput);
static struct block_device *bd_acquire(struct inode *inode)
{
struct block_device *bdev;
spin_lock(&bdev_lock);
bdev = inode->i_bdev;
if (bdev) {
atomic_inc(&bdev->bd_inode->i_count);
spin_unlock(&bdev_lock);
return bdev;
}
spin_unlock(&bdev_lock);
bdev = bdget(inode->i_rdev);
if (bdev) {
spin_lock(&bdev_lock);
if (!inode->i_bdev) {
/*
* We take an additional bd_inode->i_count for inode,
* and it's released in clear_inode() of inode.
* So, we can access it via ->i_mapping always
* without igrab().
*/
atomic_inc(&bdev->bd_inode->i_count);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
list_add(&inode->i_devices, &bdev->bd_inodes);
}
spin_unlock(&bdev_lock);
}
return bdev;
}
/* Call when you free inode */
void bd_forget(struct inode *inode)
{
struct block_device *bdev = NULL;
spin_lock(&bdev_lock);
if (inode->i_bdev) {
if (!sb_is_blkdev_sb(inode->i_sb))
bdev = inode->i_bdev;
__bd_forget(inode);
}
spin_unlock(&bdev_lock);
if (bdev)
iput(bdev->bd_inode);
}
int bd_claim(struct block_device *bdev, void *holder)
{
int res;
spin_lock(&bdev_lock);
/* first decide result */
if (bdev->bd_holder == holder)
res = 0; /* already a holder */
else if (bdev->bd_holder != NULL)
res = -EBUSY; /* held by someone else */
else if (bdev->bd_contains == bdev)
res = 0; /* is a whole device which isn't held */
else if (bdev->bd_contains->bd_holder == bd_claim)
res = 0; /* is a partition of a device that is being partitioned */
else if (bdev->bd_contains->bd_holder != NULL)
res = -EBUSY; /* is a partition of a held device */
else
res = 0; /* is a partition of an un-held device */
/* now impose change */
if (res==0) {
/* note that for a whole device bd_holders
* will be incremented twice, and bd_holder will
* be set to bd_claim before being set to holder
*/
bdev->bd_contains->bd_holders ++;
bdev->bd_contains->bd_holder = bd_claim;
bdev->bd_holders++;
bdev->bd_holder = holder;
}
spin_unlock(&bdev_lock);
return res;
}
EXPORT_SYMBOL(bd_claim);
void bd_release(struct block_device *bdev)
{
spin_lock(&bdev_lock);
if (!--bdev->bd_contains->bd_holders)