author	Mikulas Patocka <mpatocka@redhat.com>	2012-09-26 01:46:43 -0400
committer	Jens Axboe <axboe@kernel.dk>	2012-09-26 01:46:43 -0400
commit	62ac665ff9fc07497ca524bd20d6a96893d11071 (patch)
tree	dfd697e488fde4b46f1cb2ebfb380bb881115827
parent	b87570f5d349661814b262dd5fc40787700f80d6 (diff)
blockdev: turn a rw semaphore into a percpu rw semaphore
This avoids cache line bouncing when many processes lock the semaphore
for read.

New percpu lock implementation

The lock consists of an array of percpu unsigned integers, a boolean
variable and a mutex.

When we take the lock for read, we enter an RCU read-side section and
check the "locked" variable. If it is false, we increase a percpu
counter on the current cpu and exit the RCU section. If "locked" is
true, we exit the RCU section, take the mutex and drop it (this waits
until the writer finishes) and retry.

Unlocking for read just decreases the percpu counter. Note that we can
unlock on a different cpu than where we locked; in this case the
counter underflows. The sum of all percpu counters represents the
number of processes that hold the lock for read.

When we need to lock for write, we take the mutex, set the "locked"
variable to true and synchronize RCU. Since RCU has been synchronized,
no processes can create new read locks. We then wait until the sum of
the percpu counters is zero - when it is, there are no readers in the
critical section.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
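To make the counter-underflow remark above concrete, here is a small
standalone sketch (not part of the patch; the plain unsigned variables
stand in for the real percpu counters):

#include <assert.h>

int main(void)
{
	unsigned cpu0 = 0, cpu1 = 0;	/* one counter per cpu, both idle */

	cpu0++;		/* percpu_down_read() runs on cpu 0: cpu0 == 1 */
	cpu1--;		/* percpu_up_read() runs on cpu 1: cpu1 wraps to UINT_MAX */

	/* The unsigned sum wraps back to zero: no readers hold the lock. */
	assert(cpu0 + cpu1 == 0u);
	return 0;
}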
-rw-r--r--	Documentation/percpu-rw-semaphore.txt	27
-rw-r--r--	fs/block_dev.c	27
-rw-r--r--	include/linux/fs.h	3
-rw-r--r--	include/linux/percpu-rwsem.h	89
4 files changed, 135 insertions, 11 deletions
diff --git a/Documentation/percpu-rw-semaphore.txt b/Documentation/percpu-rw-semaphore.txt
new file mode 100644
index 000000000000..eddd77094725
--- /dev/null
+++ b/Documentation/percpu-rw-semaphore.txt
@@ -0,0 +1,27 @@
+Percpu rw semaphores
+--------------------
+
+The percpu rw semaphore is a new read-write semaphore design that is
+optimized for locking for reading.
+
+The problem with traditional read-write semaphores is that when multiple
+cores take the lock for reading, the cache line containing the semaphore
+is bouncing between L1 caches of the cores, causing performance
+degradation.
+
+Locking for reading is very fast, it uses RCU and it avoids any atomic
+instruction in the lock and unlock path. On the other hand, locking for
+writing is very expensive, it calls synchronize_rcu() that can take
+hundreds of microseconds.
+
+The lock is declared with "struct percpu_rw_semaphore" type.
+The lock is initialized with percpu_init_rwsem, which returns 0 on
+success and -ENOMEM on allocation failure.
+The lock must be freed with percpu_free_rwsem to avoid a memory leak.
+
+The lock is locked for read with percpu_down_read, percpu_up_read and
+for write with percpu_down_write, percpu_up_write.
+
+The idea of using RCU for an optimized rw-lock was introduced by
+Eric Dumazet <eric.dumazet@gmail.com>.
+The code was written by Mikulas Patocka <mpatocka@redhat.com>
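As an illustration of the interface documented above, a minimal usage
sketch follows (not part of the patch; "example_sem", "example_value"
and the surrounding functions are made up, only the percpu_rw_semaphore
calls come from this commit):

#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore example_sem;	/* hypothetical lock */
static int example_value;			/* data it protects */

static int example_setup(void)
{
	/* Allocates the percpu counters; returns 0 or -ENOMEM. */
	return percpu_init_rwsem(&example_sem);
}

static int example_read(void)
{
	int v;

	percpu_down_read(&example_sem);		/* fast path: RCU + percpu counter */
	v = example_value;
	percpu_up_read(&example_sem);
	return v;
}

static void example_write(int v)
{
	percpu_down_write(&example_sem);	/* slow path: synchronize_rcu() + wait */
	example_value = v;
	percpu_up_write(&example_sem);
}

static void example_teardown(void)
{
	percpu_free_rwsem(&example_sem);	/* frees the percpu counters */
}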
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cdfb625824e2..7eeb0635338b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -127,7 +127,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		return -EINVAL;
 
 	/* Prevent starting I/O or mapping the device */
-	down_write(&bdev->bd_block_size_semaphore);
+	percpu_down_write(&bdev->bd_block_size_semaphore);
 
 	/* Check that the block device is not memory mapped */
 	mapping = bdev->bd_inode->i_mapping;
@@ -135,7 +135,7 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (!prio_tree_empty(&mapping->i_mmap) ||
 	    !list_empty(&mapping->i_mmap_nonlinear)) {
 		mutex_unlock(&mapping->i_mmap_mutex);
-		up_write(&bdev->bd_block_size_semaphore);
+		percpu_up_write(&bdev->bd_block_size_semaphore);
 		return -EBUSY;
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
@@ -148,7 +148,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		kill_bdev(bdev);
 	}
 
-	up_write(&bdev->bd_block_size_semaphore);
+	percpu_up_write(&bdev->bd_block_size_semaphore);
 
 	return 0;
 }
@@ -460,6 +460,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
+
+	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+		kmem_cache_free(bdev_cachep, ei);
+		return NULL;
+	}
+
 	return &ei->vfs_inode;
 }
 
@@ -468,6 +474,8 @@ static void bdev_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
+	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -491,7 +499,6 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 	/* Initialize mutex for freeze. */
 	mutex_init(&bdev->bd_fsfreeze_mutex);
-	init_rwsem(&bdev->bd_block_size_semaphore);
 }
 
 static inline void __bd_forget(struct inode *inode)
@@ -1593,11 +1600,11 @@ ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t ret;
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	return ret;
 }
@@ -1622,7 +1629,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 	blk_start_plug(&plug);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
@@ -1633,7 +1640,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			ret = err;
 	}
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	blk_finish_plug(&plug);
 
@@ -1646,11 +1653,11 @@ int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
 	int ret;
 	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = generic_file_mmap(file, vma);
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	return ret;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e60bbd0225d5..24e1229cdfe0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,6 +10,7 @@
 #include <linux/ioctl.h>
 #include <linux/blk_types.h>
 #include <linux/types.h>
+#include <linux/percpu-rwsem.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -726,7 +727,7 @@ struct block_device {
 	/* Mutex for freeze */
 	struct mutex		bd_fsfreeze_mutex;
 	/* A semaphore that prevents I/O while block size is being changed */
-	struct rw_semaphore	bd_block_size_semaphore;
+	struct percpu_rw_semaphore	bd_block_size_semaphore;
 };
 
 /*
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
new file mode 100644
index 000000000000..cf80f7e5277f
--- /dev/null
+++ b/include/linux/percpu-rwsem.h
@@ -0,0 +1,89 @@
+#ifndef _LINUX_PERCPU_RWSEM_H
+#define _LINUX_PERCPU_RWSEM_H
+
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+
+struct percpu_rw_semaphore {
+	unsigned __percpu *counters;
+	bool locked;
+	struct mutex mtx;
+};
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *p)
+{
+	rcu_read_lock();
+	if (unlikely(p->locked)) {
+		rcu_read_unlock();
+		mutex_lock(&p->mtx);
+		this_cpu_inc(*p->counters);
+		mutex_unlock(&p->mtx);
+		return;
+	}
+	this_cpu_inc(*p->counters);
+	rcu_read_unlock();
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *p)
+{
+	/*
+	 * On X86, the write operation in this_cpu_dec serves as a memory
+	 * unlock barrier (i.e. memory accesses may be moved before the write,
+	 * but no memory accesses are moved past the write).
+	 * On other architectures this may not be the case, so we need smp_mb()
+	 * there.
+	 */
+#if defined(CONFIG_X86) && (!defined(CONFIG_X86_PPRO_FENCE) && !defined(CONFIG_X86_OOSTORE))
+	barrier();
+#else
+	smp_mb();
+#endif
+	this_cpu_dec(*p->counters);
+}
+
+static inline unsigned __percpu_count(unsigned __percpu *counters)
+{
+	unsigned total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu));
+
+	return total;
+}
+
+static inline void percpu_down_write(struct percpu_rw_semaphore *p)
+{
+	mutex_lock(&p->mtx);
+	p->locked = true;
+	synchronize_rcu();
+	while (__percpu_count(p->counters))
+		msleep(1);
+	smp_rmb(); /* paired with smp_mb() in percpu_up_read() */
+}
+
+static inline void percpu_up_write(struct percpu_rw_semaphore *p)
+{
+	p->locked = false;
+	mutex_unlock(&p->mtx);
+}
+
+static inline int percpu_init_rwsem(struct percpu_rw_semaphore *p)
+{
+	p->counters = alloc_percpu(unsigned);
+	if (unlikely(!p->counters))
+		return -ENOMEM;
+	p->locked = false;
+	mutex_init(&p->mtx);
+	return 0;
+}
+
+static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p)
+{
+	free_percpu(p->counters);
+	p->counters = NULL; /* catch use after free bugs */
+}
+
+#endif