 Documentation/percpu-rw-semaphore.txt | 27
 fs/block_dev.c                        | 27
 include/linux/fs.h                    |  3
 include/linux/percpu-rwsem.h          | 89
 4 files changed, 135 insertions, 11 deletions
diff --git a/Documentation/percpu-rw-semaphore.txt b/Documentation/percpu-rw-semaphore.txt
new file mode 100644
index 00000000000..eddd7709472
--- /dev/null
+++ b/Documentation/percpu-rw-semaphore.txt
@@ -0,0 +1,27 @@
+Percpu rw semaphores
+--------------------
+
+Percpu rw semaphores are a new read-write semaphore design that is
+optimized for locking for reading.
+
+The problem with traditional read-write semaphores is that when multiple
+cores take the lock for reading, the cache line containing the semaphore
+bounces between the L1 caches of the cores, causing performance
+degradation.
+
+Locking for reading is very fast: it uses RCU and avoids any atomic
+instruction in the lock and unlock path. On the other hand, locking for
+writing is very expensive: it calls synchronize_rcu(), which can take
+hundreds of microseconds.
+
+The lock is declared with the "struct percpu_rw_semaphore" type.
+The lock is initialized with percpu_init_rwsem, which returns 0 on
+success and -ENOMEM on allocation failure.
+The lock must be freed with percpu_free_rwsem to avoid a memory leak.
+
+The lock is locked for read with percpu_down_read / percpu_up_read and
+for write with percpu_down_write / percpu_up_write.
+
+The idea of using RCU for an optimized rw-lock was introduced by
+Eric Dumazet <eric.dumazet@gmail.com>.
+The code was written by Mikulas Patocka <mpatocka@redhat.com>.
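As an illustration of the API described in the documentation above (a minimal sketch, not part of the change itself; the structure, field, and function names are hypothetical), a reader-mostly object could be protected like this:

	/* Reader-mostly data guarded by a percpu rw semaphore. */
	struct my_config {
		struct percpu_rw_semaphore sem;
		int value;
	};

	static int my_config_read(struct my_config *c)
	{
		int v;

		percpu_down_read(&c->sem);	/* fast path: RCU + per-cpu counter, no atomics */
		v = c->value;
		percpu_up_read(&c->sem);
		return v;
	}

	static void my_config_write(struct my_config *c, int v)
	{
		percpu_down_write(&c->sem);	/* slow path: calls synchronize_rcu() */
		c->value = v;
		percpu_up_write(&c->sem);
	}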
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cdfb625824e..7eeb0635338 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -127,7 +127,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		return -EINVAL;
 
 	/* Prevent starting I/O or mapping the device */
-	down_write(&bdev->bd_block_size_semaphore);
+	percpu_down_write(&bdev->bd_block_size_semaphore);
 
 	/* Check that the block device is not memory mapped */
 	mapping = bdev->bd_inode->i_mapping;
@@ -135,7 +135,7 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (!prio_tree_empty(&mapping->i_mmap) ||
 	    !list_empty(&mapping->i_mmap_nonlinear)) {
 		mutex_unlock(&mapping->i_mmap_mutex);
-		up_write(&bdev->bd_block_size_semaphore);
+		percpu_up_write(&bdev->bd_block_size_semaphore);
 		return -EBUSY;
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
@@ -148,7 +148,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		kill_bdev(bdev);
 	}
 
-	up_write(&bdev->bd_block_size_semaphore);
+	percpu_up_write(&bdev->bd_block_size_semaphore);
 
 	return 0;
 }
@@ -460,6 +460,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
+
+	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+		kmem_cache_free(bdev_cachep, ei);
+		return NULL;
+	}
+
 	return &ei->vfs_inode;
 }
 
@@ -468,6 +474,8 @@ static void bdev_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
+	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -491,7 +499,6 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 	/* Initialize mutex for freeze. */
 	mutex_init(&bdev->bd_fsfreeze_mutex);
-	init_rwsem(&bdev->bd_block_size_semaphore);
 }
 
 static inline void __bd_forget(struct inode *inode)
@@ -1593,11 +1600,11 @@ ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t ret;
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	return ret;
 }
@@ -1622,7 +1629,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 	blk_start_plug(&plug);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
@@ -1633,7 +1640,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			ret = err;
 	}
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	blk_finish_plug(&plug);
 
@@ -1646,11 +1653,11 @@ int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
 	int ret;
 	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = generic_file_mmap(file, vma);
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	return ret;
 }
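The bdev_alloc_inode()/bdev_i_callback() hunks above also show the required lifecycle: percpu_init_rwsem() allocates per-cpu counters and can fail, so it moves into the allocation path (and the old init_rwsem() call in the slab constructor init_once() is dropped), and percpu_free_rwsem() must run before the object is freed. A reduced sketch of that pairing, with hypothetical names:

	struct my_object {
		struct percpu_rw_semaphore sem;
		/* ... other fields ... */
	};

	static struct my_object *my_object_alloc(void)
	{
		struct my_object *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

		if (!obj)
			return NULL;
		if (percpu_init_rwsem(&obj->sem)) {	/* returns -ENOMEM on failure */
			kfree(obj);
			return NULL;
		}
		return obj;
	}

	static void my_object_free(struct my_object *obj)
	{
		percpu_free_rwsem(&obj->sem);	/* releases the per-cpu counters */
		kfree(obj);
	}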
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e60bbd0225d..24e1229cdfe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,6 +10,7 @@
 #include <linux/ioctl.h>
 #include <linux/blk_types.h>
 #include <linux/types.h>
+#include <linux/percpu-rwsem.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -726,7 +727,7 @@ struct block_device {
 	/* Mutex for freeze */
 	struct mutex bd_fsfreeze_mutex;
 	/* A semaphore that prevents I/O while block size is being changed */
-	struct rw_semaphore bd_block_size_semaphore;
+	struct percpu_rw_semaphore bd_block_size_semaphore;
 };
 
 /*
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
new file mode 100644
index 00000000000..cf80f7e5277
--- /dev/null
+++ b/include/linux/percpu-rwsem.h
@@ -0,0 +1,89 @@
+#ifndef _LINUX_PERCPU_RWSEM_H
+#define _LINUX_PERCPU_RWSEM_H
+
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+
+struct percpu_rw_semaphore {
+	unsigned __percpu *counters;
+	bool locked;
+	struct mutex mtx;
+};
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *p)
+{
+	rcu_read_lock();
+	if (unlikely(p->locked)) {
+		rcu_read_unlock();
+		mutex_lock(&p->mtx);
+		this_cpu_inc(*p->counters);
+		mutex_unlock(&p->mtx);
+		return;
+	}
+	this_cpu_inc(*p->counters);
+	rcu_read_unlock();
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *p)
+{
+	/*
+	 * On X86, the write operation in this_cpu_dec serves as a memory
+	 * unlock barrier (i.e. memory accesses may be moved before the write,
+	 * but no memory accesses are moved past the write).
+	 * On other architectures this may not be the case, so we need smp_mb()
+	 * there.
+	 */
+#if defined(CONFIG_X86) && (!defined(CONFIG_X86_PPRO_FENCE) && !defined(CONFIG_X86_OOSTORE))
+	barrier();
+#else
+	smp_mb();
+#endif
+	this_cpu_dec(*p->counters);
+}
+
+static inline unsigned __percpu_count(unsigned __percpu *counters)
+{
+	unsigned total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu));
+
+	return total;
+}
+
+static inline void percpu_down_write(struct percpu_rw_semaphore *p)
+{
+	mutex_lock(&p->mtx);
+	p->locked = true;
+	synchronize_rcu();
+	while (__percpu_count(p->counters))
+		msleep(1);
+	smp_rmb(); /* paired with smp_mb() in percpu_up_read() */
+}
+
+static inline void percpu_up_write(struct percpu_rw_semaphore *p)
+{
+	p->locked = false;
+	mutex_unlock(&p->mtx);
+}
+
+static inline int percpu_init_rwsem(struct percpu_rw_semaphore *p)
+{
+	p->counters = alloc_percpu(unsigned);
+	if (unlikely(!p->counters))
+		return -ENOMEM;
+	p->locked = false;
+	mutex_init(&p->mtx);
+	return 0;
+}
+
+static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p)
+{
+	free_percpu(p->counters);
+	p->counters = NULL; /* catch use after free bugs */
+}
+
+#endif
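One property of the read side above is worth spelling out: percpu_down_read() calls rcu_read_unlock() before it returns, so the region between percpu_down_read() and percpu_up_read() is not an RCU read-side critical section and readers may sleep. The fs/block_dev.c callers rely on this when they hold the read lock across buffered I/O and mmap setup. A small sketch under that assumption, with hypothetical names:

	static ssize_t my_read(struct my_object *obj, char __user *buf, size_t len)
	{
		ssize_t ret;

		percpu_down_read(&obj->sem);
		/*
		 * Unlike a plain RCU read section, blocking is allowed here,
		 * e.g. waiting for I/O or taking mutexes.
		 */
		ret = my_do_blocking_io(obj, buf, len);	/* hypothetical helper */
		percpu_up_read(&obj->sem);
		return ret;
	}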