diff options
author | Mikulas Patocka <mpatocka@redhat.com> | 2012-09-26 01:46:40 -0400 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2012-09-26 01:46:40 -0400 |
commit | b87570f5d349661814b262dd5fc40787700f80d6 (patch) | |
tree | d06ef6c95ed114e19c864ebe0240c788dd75e85c /fs/block_dev.c | |
parent | 60ea8226cbd5c8301f9a39edc574ddabcb8150e0 (diff) |
Fix a crash when block device is read and block size is changed at the same time
The kernel may crash when block size is changed and I/O is issued
simultaneously.
Because some subsystems (udev or lvm) may read any block device at any time,
the bug actually puts any code that changes a device's block size in
jeopardy.
The crash can be reproduced if you place "msleep(1000)" in
blkdev_get_blocks just before "bh->b_size = max_blocks <<
inode->i_blkbits;".
Then, run "dd if=/dev/ram0 of=/dev/null bs=4k count=1 iflag=direct".
While it is waiting in msleep, run "blockdev --setbsz 2048 /dev/ram0".
You get a BUG.
Both the direct and non-direct I/O paths are written with the assumption
that the block size does not change. It doesn't seem practical to fix these
crashes one by one: there may be many crash possibilities when the block
size changes at a certain place, and it is impossible to find them all and
verify the code.
This patch introduces a new rw-lock bd_block_size_semaphore. The lock is
taken for read during I/O. It is taken for write when changing block
size. Consequently, block size can't be changed while I/O is being
submitted.
For asynchronous I/O, the patch only prevents block size change while
the I/O is being submitted. The block size can change when the I/O is in
progress or when the I/O is being finished. This is acceptable because
there are no accesses to block size when asynchronous I/O is being
finished.
The patch also prevents the block size from changing while the device is
mapped with mmap.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'fs/block_dev.c')
-rw-r--r-- | fs/block_dev.c | 62 |
1 files changed, 60 insertions, 2 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index 38e721b35d45..cdfb625824e2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev); | |||
116 | 116 | ||
117 | int set_blocksize(struct block_device *bdev, int size) | 117 | int set_blocksize(struct block_device *bdev, int size) |
118 | { | 118 | { |
119 | struct address_space *mapping; | ||
120 | |||
119 | /* Size must be a power of two, and between 512 and PAGE_SIZE */ | 121 | /* Size must be a power of two, and between 512 and PAGE_SIZE */ |
120 | if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) | 122 | if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) |
121 | return -EINVAL; | 123 | return -EINVAL; |
@@ -124,6 +126,20 @@ int set_blocksize(struct block_device *bdev, int size) | |||
124 | if (size < bdev_logical_block_size(bdev)) | 126 | if (size < bdev_logical_block_size(bdev)) |
125 | return -EINVAL; | 127 | return -EINVAL; |
126 | 128 | ||
129 | /* Prevent starting I/O or mapping the device */ | ||
130 | down_write(&bdev->bd_block_size_semaphore); | ||
131 | |||
132 | /* Check that the block device is not memory mapped */ | ||
133 | mapping = bdev->bd_inode->i_mapping; | ||
134 | mutex_lock(&mapping->i_mmap_mutex); | ||
135 | if (!prio_tree_empty(&mapping->i_mmap) || | ||
136 | !list_empty(&mapping->i_mmap_nonlinear)) { | ||
137 | mutex_unlock(&mapping->i_mmap_mutex); | ||
138 | up_write(&bdev->bd_block_size_semaphore); | ||
139 | return -EBUSY; | ||
140 | } | ||
141 | mutex_unlock(&mapping->i_mmap_mutex); | ||
142 | |||
127 | /* Don't change the size if it is same as current */ | 143 | /* Don't change the size if it is same as current */ |
128 | if (bdev->bd_block_size != size) { | 144 | if (bdev->bd_block_size != size) { |
129 | sync_blockdev(bdev); | 145 | sync_blockdev(bdev); |
@@ -131,6 +147,9 @@ int set_blocksize(struct block_device *bdev, int size) | |||
131 | bdev->bd_inode->i_blkbits = blksize_bits(size); | 147 | bdev->bd_inode->i_blkbits = blksize_bits(size); |
132 | kill_bdev(bdev); | 148 | kill_bdev(bdev); |
133 | } | 149 | } |
150 | |||
151 | up_write(&bdev->bd_block_size_semaphore); | ||
152 | |||
134 | return 0; | 153 | return 0; |
135 | } | 154 | } |
136 | 155 | ||
@@ -472,6 +491,7 @@ static void init_once(void *foo) | |||
472 | inode_init_once(&ei->vfs_inode); | 491 | inode_init_once(&ei->vfs_inode); |
473 | /* Initialize mutex for freeze. */ | 492 | /* Initialize mutex for freeze. */ |
474 | mutex_init(&bdev->bd_fsfreeze_mutex); | 493 | mutex_init(&bdev->bd_fsfreeze_mutex); |
494 | init_rwsem(&bdev->bd_block_size_semaphore); | ||
475 | } | 495 | } |
476 | 496 | ||
477 | static inline void __bd_forget(struct inode *inode) | 497 | static inline void __bd_forget(struct inode *inode) |
@@ -1567,6 +1587,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
1567 | return blkdev_ioctl(bdev, mode, cmd, arg); | 1587 | return blkdev_ioctl(bdev, mode, cmd, arg); |
1568 | } | 1588 | } |
1569 | 1589 | ||
1590 | ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, | ||
1591 | unsigned long nr_segs, loff_t pos) | ||
1592 | { | ||
1593 | ssize_t ret; | ||
1594 | struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); | ||
1595 | |||
1596 | down_read(&bdev->bd_block_size_semaphore); | ||
1597 | |||
1598 | ret = generic_file_aio_read(iocb, iov, nr_segs, pos); | ||
1599 | |||
1600 | up_read(&bdev->bd_block_size_semaphore); | ||
1601 | |||
1602 | return ret; | ||
1603 | } | ||
1604 | EXPORT_SYMBOL_GPL(blkdev_aio_read); | ||
1605 | |||
1570 | /* | 1606 | /* |
1571 | * Write data to the block device. Only intended for the block device itself | 1607 | * Write data to the block device. Only intended for the block device itself |
1572 | * and the raw driver which basically is a fake block device. | 1608 | * and the raw driver which basically is a fake block device. |
@@ -1578,12 +1614,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
1578 | unsigned long nr_segs, loff_t pos) | 1614 | unsigned long nr_segs, loff_t pos) |
1579 | { | 1615 | { |
1580 | struct file *file = iocb->ki_filp; | 1616 | struct file *file = iocb->ki_filp; |
1617 | struct block_device *bdev = I_BDEV(file->f_mapping->host); | ||
1581 | struct blk_plug plug; | 1618 | struct blk_plug plug; |
1582 | ssize_t ret; | 1619 | ssize_t ret; |
1583 | 1620 | ||
1584 | BUG_ON(iocb->ki_pos != pos); | 1621 | BUG_ON(iocb->ki_pos != pos); |
1585 | 1622 | ||
1586 | blk_start_plug(&plug); | 1623 | blk_start_plug(&plug); |
1624 | |||
1625 | down_read(&bdev->bd_block_size_semaphore); | ||
1626 | |||
1587 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); | 1627 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
1588 | if (ret > 0 || ret == -EIOCBQUEUED) { | 1628 | if (ret > 0 || ret == -EIOCBQUEUED) { |
1589 | ssize_t err; | 1629 | ssize_t err; |
@@ -1592,11 +1632,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
1592 | if (err < 0 && ret > 0) | 1632 | if (err < 0 && ret > 0) |
1593 | ret = err; | 1633 | ret = err; |
1594 | } | 1634 | } |
1635 | |||
1636 | up_read(&bdev->bd_block_size_semaphore); | ||
1637 | |||
1595 | blk_finish_plug(&plug); | 1638 | blk_finish_plug(&plug); |
1639 | |||
1596 | return ret; | 1640 | return ret; |
1597 | } | 1641 | } |
1598 | EXPORT_SYMBOL_GPL(blkdev_aio_write); | 1642 | EXPORT_SYMBOL_GPL(blkdev_aio_write); |
1599 | 1643 | ||
1644 | int blkdev_mmap(struct file *file, struct vm_area_struct *vma) | ||
1645 | { | ||
1646 | int ret; | ||
1647 | struct block_device *bdev = I_BDEV(file->f_mapping->host); | ||
1648 | |||
1649 | down_read(&bdev->bd_block_size_semaphore); | ||
1650 | |||
1651 | ret = generic_file_mmap(file, vma); | ||
1652 | |||
1653 | up_read(&bdev->bd_block_size_semaphore); | ||
1654 | |||
1655 | return ret; | ||
1656 | } | ||
1657 | |||
1600 | /* | 1658 | /* |
1601 | * Try to release a page associated with block device when the system | 1659 | * Try to release a page associated with block device when the system |
1602 | * is under memory pressure. | 1660 | * is under memory pressure. |
@@ -1627,9 +1685,9 @@ const struct file_operations def_blk_fops = { | |||
1627 | .llseek = block_llseek, | 1685 | .llseek = block_llseek, |
1628 | .read = do_sync_read, | 1686 | .read = do_sync_read, |
1629 | .write = do_sync_write, | 1687 | .write = do_sync_write, |
1630 | .aio_read = generic_file_aio_read, | 1688 | .aio_read = blkdev_aio_read, |
1631 | .aio_write = blkdev_aio_write, | 1689 | .aio_write = blkdev_aio_write, |
1632 | .mmap = generic_file_mmap, | 1690 | .mmap = blkdev_mmap, |
1633 | .fsync = blkdev_fsync, | 1691 | .fsync = blkdev_fsync, |
1634 | .unlocked_ioctl = block_ioctl, | 1692 | .unlocked_ioctl = block_ioctl, |
1635 | #ifdef CONFIG_COMPAT | 1693 | #ifdef CONFIG_COMPAT |