aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorMikulas Patocka <mpatocka@redhat.com>2012-09-26 01:46:43 -0400
committerJens Axboe <axboe@kernel.dk>2012-09-26 01:46:43 -0400
commit62ac665ff9fc07497ca524bd20d6a96893d11071 (patch)
treedfd697e488fde4b46f1cb2ebfb380bb881115827 /fs
parentb87570f5d349661814b262dd5fc40787700f80d6 (diff)
blockdev: turn a rw semaphore into a percpu rw semaphore
This avoids cache line bouncing when many processes lock the semaphore for read. New percpu lock implementation: the lock consists of an array of percpu unsigned integers, a boolean variable and a mutex. When we take the lock for read, we enter an rcu read section and check the "locked" variable. If it is false, we increase a percpu counter on the current cpu and exit the rcu section. If "locked" is true, we exit the rcu section, take the mutex and drop it (this waits until a writer has finished) and retry. Unlocking for read just decreases the percpu variable. Note that we can unlock on a different cpu than where we locked; in this case the counter underflows. The sum of all percpu counters represents the number of processes that hold the lock for read. When we need to lock for write, we take the mutex, set the "locked" variable to true and synchronize rcu. Since RCU has been synchronized, no processes can create new read locks. We wait until the sum of the percpu counters is zero - when it is, there are no readers in the critical section. Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'fs')
-rw-r--r--fs/block_dev.c27
1 file changed, 17 insertions, 10 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cdfb625824e2..7eeb0635338b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -127,7 +127,7 @@ int set_blocksize(struct block_device *bdev, int size)
127 return -EINVAL; 127 return -EINVAL;
128 128
129 /* Prevent starting I/O or mapping the device */ 129 /* Prevent starting I/O or mapping the device */
130 down_write(&bdev->bd_block_size_semaphore); 130 percpu_down_write(&bdev->bd_block_size_semaphore);
131 131
132 /* Check that the block device is not memory mapped */ 132 /* Check that the block device is not memory mapped */
133 mapping = bdev->bd_inode->i_mapping; 133 mapping = bdev->bd_inode->i_mapping;
@@ -135,7 +135,7 @@ int set_blocksize(struct block_device *bdev, int size)
135 if (!prio_tree_empty(&mapping->i_mmap) || 135 if (!prio_tree_empty(&mapping->i_mmap) ||
136 !list_empty(&mapping->i_mmap_nonlinear)) { 136 !list_empty(&mapping->i_mmap_nonlinear)) {
137 mutex_unlock(&mapping->i_mmap_mutex); 137 mutex_unlock(&mapping->i_mmap_mutex);
138 up_write(&bdev->bd_block_size_semaphore); 138 percpu_up_write(&bdev->bd_block_size_semaphore);
139 return -EBUSY; 139 return -EBUSY;
140 } 140 }
141 mutex_unlock(&mapping->i_mmap_mutex); 141 mutex_unlock(&mapping->i_mmap_mutex);
@@ -148,7 +148,7 @@ int set_blocksize(struct block_device *bdev, int size)
148 kill_bdev(bdev); 148 kill_bdev(bdev);
149 } 149 }
150 150
151 up_write(&bdev->bd_block_size_semaphore); 151 percpu_up_write(&bdev->bd_block_size_semaphore);
152 152
153 return 0; 153 return 0;
154} 154}
@@ -460,6 +460,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
460 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); 460 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
461 if (!ei) 461 if (!ei)
462 return NULL; 462 return NULL;
463
464 if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
465 kmem_cache_free(bdev_cachep, ei);
466 return NULL;
467 }
468
463 return &ei->vfs_inode; 469 return &ei->vfs_inode;
464} 470}
465 471
@@ -468,6 +474,8 @@ static void bdev_i_callback(struct rcu_head *head)
468 struct inode *inode = container_of(head, struct inode, i_rcu); 474 struct inode *inode = container_of(head, struct inode, i_rcu);
469 struct bdev_inode *bdi = BDEV_I(inode); 475 struct bdev_inode *bdi = BDEV_I(inode);
470 476
477 percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
478
471 kmem_cache_free(bdev_cachep, bdi); 479 kmem_cache_free(bdev_cachep, bdi);
472} 480}
473 481
@@ -491,7 +499,6 @@ static void init_once(void *foo)
491 inode_init_once(&ei->vfs_inode); 499 inode_init_once(&ei->vfs_inode);
492 /* Initialize mutex for freeze. */ 500 /* Initialize mutex for freeze. */
493 mutex_init(&bdev->bd_fsfreeze_mutex); 501 mutex_init(&bdev->bd_fsfreeze_mutex);
494 init_rwsem(&bdev->bd_block_size_semaphore);
495} 502}
496 503
497static inline void __bd_forget(struct inode *inode) 504static inline void __bd_forget(struct inode *inode)
@@ -1593,11 +1600,11 @@ ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
1593 ssize_t ret; 1600 ssize_t ret;
1594 struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); 1601 struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
1595 1602
1596 down_read(&bdev->bd_block_size_semaphore); 1603 percpu_down_read(&bdev->bd_block_size_semaphore);
1597 1604
1598 ret = generic_file_aio_read(iocb, iov, nr_segs, pos); 1605 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
1599 1606
1600 up_read(&bdev->bd_block_size_semaphore); 1607 percpu_up_read(&bdev->bd_block_size_semaphore);
1601 1608
1602 return ret; 1609 return ret;
1603} 1610}
@@ -1622,7 +1629,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1622 1629
1623 blk_start_plug(&plug); 1630 blk_start_plug(&plug);
1624 1631
1625 down_read(&bdev->bd_block_size_semaphore); 1632 percpu_down_read(&bdev->bd_block_size_semaphore);
1626 1633
1627 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1634 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1628 if (ret > 0 || ret == -EIOCBQUEUED) { 1635 if (ret > 0 || ret == -EIOCBQUEUED) {
@@ -1633,7 +1640,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1633 ret = err; 1640 ret = err;
1634 } 1641 }
1635 1642
1636 up_read(&bdev->bd_block_size_semaphore); 1643 percpu_up_read(&bdev->bd_block_size_semaphore);
1637 1644
1638 blk_finish_plug(&plug); 1645 blk_finish_plug(&plug);
1639 1646
@@ -1646,11 +1653,11 @@ int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
1646 int ret; 1653 int ret;
1647 struct block_device *bdev = I_BDEV(file->f_mapping->host); 1654 struct block_device *bdev = I_BDEV(file->f_mapping->host);
1648 1655
1649 down_read(&bdev->bd_block_size_semaphore); 1656 percpu_down_read(&bdev->bd_block_size_semaphore);
1650 1657
1651 ret = generic_file_mmap(file, vma); 1658 ret = generic_file_mmap(file, vma);
1652 1659
1653 up_read(&bdev->bd_block_size_semaphore); 1660 percpu_up_read(&bdev->bd_block_size_semaphore);
1654 1661
1655 return ret; 1662 return ret;
1656} 1663}