aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author Mikulas Patocka <mpatocka@redhat.com> 2012-09-26 01:46:40 -0400
committer Jens Axboe <axboe@kernel.dk> 2012-09-26 01:46:40 -0400
commit b87570f5d349661814b262dd5fc40787700f80d6 (patch)
tree d06ef6c95ed114e19c864ebe0240c788dd75e85c /fs
parent 60ea8226cbd5c8301f9a39edc574ddabcb8150e0 (diff)
Fix a crash when block device is read and block size is changed at the same time
The kernel may crash when block size is changed and I/O is issued simultaneously.

Because some subsystems (udev or lvm) may read any block device at any time, the bug actually puts any code that changes a block device's block size in jeopardy.

The crash can be reproduced if you place "msleep(1000)" in blkdev_get_blocks just before "bh->b_size = max_blocks << inode->i_blkbits;".

Then, run "dd if=/dev/ram0 of=/dev/null bs=4k count=1 iflag=direct". While it is waiting in msleep, run "blockdev --setbsz 2048 /dev/ram0". You get a BUG.

The direct and non-direct I/O paths are written with the assumption that block size does not change. It doesn't seem practical to fix these crashes one-by-one: there may be many crash possibilities when block size changes at a certain place, and it is impossible to find them all and verify the code.

This patch introduces a new rw-lock, bd_block_size_semaphore. The lock is taken for read during I/O. It is taken for write when changing block size. Consequently, block size can't be changed while I/O is being submitted.

For asynchronous I/O, the patch only prevents block size change while the I/O is being submitted. The block size can change when the I/O is in progress or when the I/O is being finished. This is acceptable because there are no accesses to block size when asynchronous I/O is being finished.

The patch also prevents block size from changing while the device is mapped with mmap.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'fs')
-rw-r--r-- fs/block_dev.c 62
1 files changed, 60 insertions, 2 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 38e721b35d45..cdfb625824e2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev);
116 116
117int set_blocksize(struct block_device *bdev, int size) 117int set_blocksize(struct block_device *bdev, int size)
118{ 118{
119 struct address_space *mapping;
120
119 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 121 /* Size must be a power of two, and between 512 and PAGE_SIZE */
120 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) 122 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
121 return -EINVAL; 123 return -EINVAL;
@@ -124,6 +126,20 @@ int set_blocksize(struct block_device *bdev, int size)
124 if (size < bdev_logical_block_size(bdev)) 126 if (size < bdev_logical_block_size(bdev))
125 return -EINVAL; 127 return -EINVAL;
126 128
129 /* Prevent starting I/O or mapping the device */
130 down_write(&bdev->bd_block_size_semaphore);
131
132 /* Check that the block device is not memory mapped */
133 mapping = bdev->bd_inode->i_mapping;
134 mutex_lock(&mapping->i_mmap_mutex);
135 if (!prio_tree_empty(&mapping->i_mmap) ||
136 !list_empty(&mapping->i_mmap_nonlinear)) {
137 mutex_unlock(&mapping->i_mmap_mutex);
138 up_write(&bdev->bd_block_size_semaphore);
139 return -EBUSY;
140 }
141 mutex_unlock(&mapping->i_mmap_mutex);
142
127 /* Don't change the size if it is same as current */ 143 /* Don't change the size if it is same as current */
128 if (bdev->bd_block_size != size) { 144 if (bdev->bd_block_size != size) {
129 sync_blockdev(bdev); 145 sync_blockdev(bdev);
@@ -131,6 +147,9 @@ int set_blocksize(struct block_device *bdev, int size)
131 bdev->bd_inode->i_blkbits = blksize_bits(size); 147 bdev->bd_inode->i_blkbits = blksize_bits(size);
132 kill_bdev(bdev); 148 kill_bdev(bdev);
133 } 149 }
150
151 up_write(&bdev->bd_block_size_semaphore);
152
134 return 0; 153 return 0;
135} 154}
136 155
@@ -472,6 +491,7 @@ static void init_once(void *foo)
472 inode_init_once(&ei->vfs_inode); 491 inode_init_once(&ei->vfs_inode);
473 /* Initialize mutex for freeze. */ 492 /* Initialize mutex for freeze. */
474 mutex_init(&bdev->bd_fsfreeze_mutex); 493 mutex_init(&bdev->bd_fsfreeze_mutex);
494 init_rwsem(&bdev->bd_block_size_semaphore);
475} 495}
476 496
477static inline void __bd_forget(struct inode *inode) 497static inline void __bd_forget(struct inode *inode)
@@ -1567,6 +1587,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1567 return blkdev_ioctl(bdev, mode, cmd, arg); 1587 return blkdev_ioctl(bdev, mode, cmd, arg);
1568} 1588}
1569 1589
1590ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1591 unsigned long nr_segs, loff_t pos)
1592{
1593 ssize_t ret;
1594 struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
1595
1596 down_read(&bdev->bd_block_size_semaphore);
1597
1598 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
1599
1600 up_read(&bdev->bd_block_size_semaphore);
1601
1602 return ret;
1603}
1604EXPORT_SYMBOL_GPL(blkdev_aio_read);
1605
1570/* 1606/*
1571 * Write data to the block device. Only intended for the block device itself 1607 * Write data to the block device. Only intended for the block device itself
1572 * and the raw driver which basically is a fake block device. 1608 * and the raw driver which basically is a fake block device.
@@ -1578,12 +1614,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1578 unsigned long nr_segs, loff_t pos) 1614 unsigned long nr_segs, loff_t pos)
1579{ 1615{
1580 struct file *file = iocb->ki_filp; 1616 struct file *file = iocb->ki_filp;
1617 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1581 struct blk_plug plug; 1618 struct blk_plug plug;
1582 ssize_t ret; 1619 ssize_t ret;
1583 1620
1584 BUG_ON(iocb->ki_pos != pos); 1621 BUG_ON(iocb->ki_pos != pos);
1585 1622
1586 blk_start_plug(&plug); 1623 blk_start_plug(&plug);
1624
1625 down_read(&bdev->bd_block_size_semaphore);
1626
1587 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1627 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1588 if (ret > 0 || ret == -EIOCBQUEUED) { 1628 if (ret > 0 || ret == -EIOCBQUEUED) {
1589 ssize_t err; 1629 ssize_t err;
@@ -1592,11 +1632,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1592 if (err < 0 && ret > 0) 1632 if (err < 0 && ret > 0)
1593 ret = err; 1633 ret = err;
1594 } 1634 }
1635
1636 up_read(&bdev->bd_block_size_semaphore);
1637
1595 blk_finish_plug(&plug); 1638 blk_finish_plug(&plug);
1639
1596 return ret; 1640 return ret;
1597} 1641}
1598EXPORT_SYMBOL_GPL(blkdev_aio_write); 1642EXPORT_SYMBOL_GPL(blkdev_aio_write);
1599 1643
1644int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
1645{
1646 int ret;
1647 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1648
1649 down_read(&bdev->bd_block_size_semaphore);
1650
1651 ret = generic_file_mmap(file, vma);
1652
1653 up_read(&bdev->bd_block_size_semaphore);
1654
1655 return ret;
1656}
1657
1600/* 1658/*
1601 * Try to release a page associated with block device when the system 1659 * Try to release a page associated with block device when the system
1602 * is under memory pressure. 1660 * is under memory pressure.
@@ -1627,9 +1685,9 @@ const struct file_operations def_blk_fops = {
1627 .llseek = block_llseek, 1685 .llseek = block_llseek,
1628 .read = do_sync_read, 1686 .read = do_sync_read,
1629 .write = do_sync_write, 1687 .write = do_sync_write,
1630 .aio_read = generic_file_aio_read, 1688 .aio_read = blkdev_aio_read,
1631 .aio_write = blkdev_aio_write, 1689 .aio_write = blkdev_aio_write,
1632 .mmap = generic_file_mmap, 1690 .mmap = blkdev_mmap,
1633 .fsync = blkdev_fsync, 1691 .fsync = blkdev_fsync,
1634 .unlocked_ioctl = block_ioctl, 1692 .unlocked_ioctl = block_ioctl,
1635#ifdef CONFIG_COMPAT 1693#ifdef CONFIG_COMPAT