author	Mikulas Patocka <mpatocka@redhat.com>	2012-09-26 01:46:43 -0400
committer	Jens Axboe <axboe@kernel.dk>	2012-09-26 01:46:43 -0400
commit	62ac665ff9fc07497ca524bd20d6a96893d11071 (patch)
tree	dfd697e488fde4b46f1cb2ebfb380bb881115827
parent	b87570f5d349661814b262dd5fc40787700f80d6 (diff)
blockdev: turn a rw semaphore into a percpu rw semaphore
This avoids cache line bouncing when many processes lock the semaphore
for read.

New percpu lock implementation

The lock consists of an array of percpu unsigned integers, a boolean
variable and a mutex.

When we take the lock for read, we enter an RCU read-side section and
check the "locked" variable. If it is false, we increase a percpu
counter on the current cpu and exit the RCU section. If "locked" is
true, we exit the RCU section, take the mutex and drop it (this waits
until the writer finishes) and retry.

Unlocking for read just decreases the percpu counter. Note that we can
unlock on a different cpu than where we locked; in this case the
counter underflows. The sum of all percpu counters represents the
number of processes that hold the lock for read.

When we need to lock for write, we take the mutex, set the "locked"
variable to true and synchronize RCU. Since RCU has been synchronized,
no processes can create new read locks. We then wait until the sum of
the percpu counters is zero - when it is, there are no readers in the
critical section.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
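To make the counter-underflow remark above concrete, here is a small
standalone sketch (not part of the patch; the plain unsigned variables
stand in for the real percpu counters):

#include <assert.h>

int main(void)
{
	unsigned cpu0 = 0, cpu1 = 0;	/* one counter per cpu, both idle */

	cpu0++;		/* percpu_down_read() runs on cpu 0: cpu0 == 1 */
	cpu1--;		/* percpu_up_read() runs on cpu 1: cpu1 wraps to UINT_MAX */

	/* The unsigned sum wraps back to zero: no readers hold the lock. */
	assert(cpu0 + cpu1 == 0u);
	return 0;
}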
-rw-r--r--	Documentation/percpu-rw-semaphore.txt	27
-rw-r--r--	fs/block_dev.c	27
-rw-r--r--	include/linux/fs.h	3
-rw-r--r--	include/linux/percpu-rwsem.h	89
4 files changed, 135 insertions, 11 deletions
diff --git a/Documentation/percpu-rw-semaphore.txt b/Documentation/percpu-rw-semaphore.txt
new file mode 100644
index 000000000000..eddd77094725
--- /dev/null
+++ b/Documentation/percpu-rw-semaphore.txt
@@ -0,0 +1,27 @@
+Percpu rw semaphores
+--------------------
+
+The percpu rw semaphore is a new read-write semaphore design that is
+optimized for locking for reading.
+
+The problem with traditional read-write semaphores is that when multiple
+cores take the lock for reading, the cache line containing the semaphore
+is bouncing between L1 caches of the cores, causing performance
+degradation.
+
+Locking for reading is very fast, it uses RCU and it avoids any atomic
+instruction in the lock and unlock path. On the other hand, locking for
+writing is very expensive, it calls synchronize_rcu() that can take
+hundreds of microseconds.
+
+The lock is declared with "struct percpu_rw_semaphore" type.
+The lock is initialized with percpu_init_rwsem, which returns 0 on
+success and -ENOMEM on allocation failure.
+The lock must be freed with percpu_free_rwsem to avoid a memory leak.
+
+The lock is locked for read with percpu_down_read, percpu_up_read and
+for write with percpu_down_write, percpu_up_write.
+
+The idea of using RCU for an optimized rw-lock was introduced by
+Eric Dumazet <eric.dumazet@gmail.com>.
+The code was written by Mikulas Patocka <mpatocka@redhat.com>
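As an illustration of the interface documented above, a minimal usage
sketch follows (not part of the patch; "example_sem", "example_value"
and the surrounding functions are made up, only the percpu_rw_semaphore
calls come from this commit):

#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore example_sem;	/* hypothetical lock */
static int example_value;			/* data it protects */

static int example_setup(void)
{
	/* Allocates the percpu counters; returns 0 or -ENOMEM. */
	return percpu_init_rwsem(&example_sem);
}

static int example_read(void)
{
	int v;

	percpu_down_read(&example_sem);		/* fast path: RCU + percpu counter */
	v = example_value;
	percpu_up_read(&example_sem);
	return v;
}

static void example_write(int v)
{
	percpu_down_write(&example_sem);	/* slow path: synchronize_rcu() + wait */
	example_value = v;
	percpu_up_write(&example_sem);
}

static void example_teardown(void)
{
	percpu_free_rwsem(&example_sem);	/* frees the percpu counters */
}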
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cdfb625824e2..7eeb0635338b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -127,7 +127,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		return -EINVAL;
 
 	/* Prevent starting I/O or mapping the device */
-	down_write(&bdev->bd_block_size_semaphore);
+	percpu_down_write(&bdev->bd_block_size_semaphore);
 
 	/* Check that the block device is not memory mapped */
 	mapping = bdev->bd_inode->i_mapping;
@@ -135,7 +135,7 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (!prio_tree_empty(&mapping->i_mmap) ||
 	    !list_empty(&mapping->i_mmap_nonlinear)) {
 		mutex_unlock(&mapping->i_mmap_mutex);
-		up_write(&bdev->bd_block_size_semaphore);
+		percpu_up_write(&bdev->bd_block_size_semaphore);
 		return -EBUSY;
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
@@ -148,7 +148,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		kill_bdev(bdev);
 	}
 
-	up_write(&bdev->bd_block_size_semaphore);
+	percpu_up_write(&bdev->bd_block_size_semaphore);
 
 	return 0;
 }
@@ -460,6 +460,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
+
+	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+		kmem_cache_free(bdev_cachep, ei);
+		return NULL;
+	}
+
 	return &ei->vfs_inode;
 }
 
@@ -468,6 +474,8 @@ static void bdev_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
+	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -491,7 +499,6 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 	/* Initialize mutex for freeze. */
 	mutex_init(&bdev->bd_fsfreeze_mutex);
-	init_rwsem(&bdev->bd_block_size_semaphore);
 }
 
 static inline void __bd_forget(struct inode *inode)
@@ -1593,11 +1600,11 @@ ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t ret;
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	return ret;
 }
@@ -1622,7 +1629,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 	blk_start_plug(&plug);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
@@ -1633,7 +1640,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			ret = err;
 	}
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	blk_finish_plug(&plug);
 
@@ -1646,11 +1653,11 @@ int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
 	int ret;
 	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 
-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);
 
 	ret = generic_file_mmap(file, vma);
 
-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);
 
 	return ret;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e60bbd0225d5..24e1229cdfe0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,6 +10,7 @@
 #include <linux/ioctl.h>
 #include <linux/blk_types.h>
 #include <linux/types.h>
+#include <linux/percpu-rwsem.h>
 
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -726,7 +727,7 @@ struct block_device {
 	/* Mutex for freeze */
 	struct mutex		bd_fsfreeze_mutex;
 	/* A semaphore that prevents I/O while block size is being changed */
-	struct rw_semaphore	bd_block_size_semaphore;
+	struct percpu_rw_semaphore	bd_block_size_semaphore;
 };
 
 /*
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
new file mode 100644
index 000000000000..cf80f7e5277f
--- /dev/null
+++ b/include/linux/percpu-rwsem.h
@@ -0,0 +1,89 @@
+#ifndef _LINUX_PERCPU_RWSEM_H
+#define _LINUX_PERCPU_RWSEM_H
+
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+
+struct percpu_rw_semaphore {
+	unsigned __percpu *counters;
+	bool locked;
+	struct mutex mtx;
+};
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *p)
+{
+	rcu_read_lock();
+	if (unlikely(p->locked)) {
+		rcu_read_unlock();
+		mutex_lock(&p->mtx);
+		this_cpu_inc(*p->counters);
+		mutex_unlock(&p->mtx);
+		return;
+	}
+	this_cpu_inc(*p->counters);
+	rcu_read_unlock();
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *p)
+{
+	/*
+	 * On X86, the write operation in this_cpu_dec serves as a memory
+	 * unlock barrier (i.e. memory accesses may be moved before the write,
+	 * but no memory accesses are moved past the write).
+	 * On other architectures this may not be the case, so we need smp_mb()
+	 * there.
+	 */
+#if defined(CONFIG_X86) && (!defined(CONFIG_X86_PPRO_FENCE) && !defined(CONFIG_X86_OOSTORE))
+	barrier();
+#else
+	smp_mb();
+#endif
+	this_cpu_dec(*p->counters);
+}
+
+static inline unsigned __percpu_count(unsigned __percpu *counters)
+{
+	unsigned total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu));
+
+	return total;
+}
+
+static inline void percpu_down_write(struct percpu_rw_semaphore *p)
+{
+	mutex_lock(&p->mtx);
+	p->locked = true;
+	synchronize_rcu();
+	while (__percpu_count(p->counters))
+		msleep(1);
+	smp_rmb(); /* paired with smp_mb() in percpu_up_read() */
+}
+
+static inline void percpu_up_write(struct percpu_rw_semaphore *p)
+{
+	p->locked = false;
+	mutex_unlock(&p->mtx);
+}
+
+static inline int percpu_init_rwsem(struct percpu_rw_semaphore *p)
+{
+	p->counters = alloc_percpu(unsigned);
+	if (unlikely(!p->counters))
+		return -ENOMEM;
+	p->locked = false;
+	mutex_init(&p->mtx);
+	return 0;
+}
+
+static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p)
+{
+	free_percpu(p->counters);
+	p->counters = NULL; /* catch use after free bugs */
+}
+
+#endif