author	Waiman Long <longman@redhat.com>	2019-04-28 17:25:38 -0400
committer	Ingo Molnar <mingo@kernel.org>	2019-05-07 02:46:46 -0400
commit	a9e9bcb45b1525ba7aea26ed9441e8632aeeda58 (patch)
tree	45f18fce3d3a1abb04ab88daf01488dcbe048b2d /kernel/locking
parent	ffa6f55eb6188ee73339cab710fabf30d13110a7 (diff)
locking/rwsem: Prevent decrement of reader count before increment
During my rwsem testing, it was found that after a down_read(), the reader count may occasionally become 0 or even negative. Consequently, a writer may steal the lock at that time and execute in parallel with the reader, breaking the mutual exclusion guarantee of the write lock. In other words, both readers and the writer can become rwsem owners simultaneously.

The current reader wakeup code does it in one pass: it clears waiter->task and puts the waiters into wake_q before fully incrementing the reader count. Once waiter->task is cleared, the corresponding reader may see it, finish its critical section and do an unlock that decrements the count before the count is incremented. This is not a problem if there is only one reader to wake up, as the count has been pre-incremented by 1. It is a problem if there is more than one reader to be woken up, since a writer can then steal the lock.

The wakeup was actually done in 2 passes before the following v4.9 commit:

  70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once")

To fix this problem, the wakeup is now done in two passes again. In the first pass, the readers are collected and counted, and the reader count is then fully incremented. In the second pass, waiter->task is cleared and the waiters are put into wake_q to be woken up later.

Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Fixes: 70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once")
Link: http://lkml.kernel.org/r/20190428212557.13482-2-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
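To make the ordering concrete, here is a minimal userspace sketch of the two-pass idea under simplified assumptions (struct waiter, reader_count and wake_readers() are names invented for this illustration; the real code operates on struct rwsem_waiter, sem->count and wake_q, as in the diff below). The only point is the ordering: the shared count is credited for every about-to-run reader before any waiter->task is cleared, so no reader can unlock and decrement the count before its increment has happened.

#include <stdatomic.h>
#include <stddef.h>

/* Illustrative only: a waiter's reader is released once task is cleared. */
struct waiter {
	struct waiter *next;
	_Atomic(void *) task;
};

static atomic_long reader_count;	/* stand-in for the rwsem reader count */

/* head is assumed to hold only the read-waiters that are to be woken. */
static void wake_readers(struct waiter *head)
{
	struct waiter *w, *next;
	long woken = 0;

	/* Pass 1: just count the readers; none of them may run yet. */
	for (w = head; w; w = w->next)
		woken++;

	/*
	 * Credit all of them in the shared count *before* releasing any,
	 * so a fast reader cannot decrement it ahead of this increment.
	 */
	atomic_fetch_add(&reader_count, woken);

	/* Pass 2: clearing ->task releases the reader, so fetch next first. */
	for (w = head; w; w = next) {
		next = w->next;
		atomic_store(&w->task, NULL);
	}
}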
Diffstat (limited to 'kernel/locking')
-rw-r--r--	kernel/locking/rwsem-xadd.c	46
1 file changed, 31 insertions, 15 deletions
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 6b3ee9948bf1..0b1f77957240 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -130,6 +130,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
 {
 	struct rwsem_waiter *waiter, *tmp;
 	long oldcount, woken = 0, adjustment = 0;
+	struct list_head wlist;
 
 	/*
 	 * Take a peek at the queue head waiter such that we can determine
@@ -188,18 +189,43 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
 	 * of the queue. We know that woken will be at least 1 as we accounted
 	 * for above. Note we increment the 'active part' of the count by the
 	 * number of readers before waking any processes up.
+	 *
+	 * We have to do wakeup in 2 passes to prevent the possibility that
+	 * the reader count may be decremented before it is incremented. It
+	 * is because the to-be-woken waiter may not have slept yet. So it
+	 * may see waiter->task got cleared, finish its critical section and
+	 * do an unlock before the reader count increment.
+	 *
+	 * 1) Collect the read-waiters in a separate list, count them and
+	 *    fully increment the reader count in rwsem.
+	 * 2) For each waiters in the new list, clear waiter->task and
+	 *    put them into wake_q to be woken up later.
 	 */
-	list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
-		struct task_struct *tsk;
-
+	list_for_each_entry(waiter, &sem->wait_list, list) {
 		if (waiter->type == RWSEM_WAITING_FOR_WRITE)
 			break;
 
 		woken++;
-		tsk = waiter->task;
+	}
+	list_cut_before(&wlist, &sem->wait_list, &waiter->list);
+
+	adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
+	lockevent_cond_inc(rwsem_wake_reader, woken);
+	if (list_empty(&sem->wait_list)) {
+		/* hit end of list above */
+		adjustment -= RWSEM_WAITING_BIAS;
+	}
+
+	if (adjustment)
+		atomic_long_add(adjustment, &sem->count);
+
+	/* 2nd pass */
+	list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+		struct task_struct *tsk;
 
+		tsk = waiter->task;
 		get_task_struct(tsk);
-		list_del(&waiter->list);
+
 		/*
 		 * Ensure calling get_task_struct() before setting the reader
 		 * waiter to nil such that rwsem_down_read_failed() cannot
@@ -213,16 +239,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
 		 */
 		wake_q_add_safe(wake_q, tsk);
 	}
-
-	adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
-	lockevent_cond_inc(rwsem_wake_reader, woken);
-	if (list_empty(&sem->wait_list)) {
-		/* hit end of list above */
-		adjustment -= RWSEM_WAITING_BIAS;
-	}
-
-	if (adjustment)
-		atomic_long_add(adjustment, &sem->count);
 }
 
 /*
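One detail worth spelling out is what list_cut_before() does in the first pass: after the counting loop, waiter points either at the first write-waiter or, when the loop runs off the end, at the head of the list itself, so list_cut_before(&wlist, &sem->wait_list, &waiter->list) moves exactly the counted read-waiters onto the local wlist, and an all-reader queue leaves sem->wait_list empty for the list_empty() check that follows. The sketch below is a userspace model of that documented behaviour; cut_before() and the test scaffolding are invented for the example and are not kernel code.

#include <assert.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

/*
 * Userspace model of list_cut_before(): move the entries of @head that sit
 * in front of @entry onto @list; @entry itself and everything after it stay
 * on @head. If @entry is @head itself (the counting loop ran off the end
 * because every waiter was a reader), everything moves and @head ends up
 * empty.
 */
static void cut_before(struct list_head *list, struct list_head *head,
		       struct list_head *entry)
{
	if (head->next == entry) {		/* nothing in front of entry */
		list->next = list->prev = list;
		return;
	}
	list->next = head->next;
	list->next->prev = list;
	list->prev = entry->prev;
	list->prev->next = list;
	head->next = entry;
	entry->prev = head;
}

int main(void)
{
	struct list_head head, wlist, r1, r2, w;

	/* wait_list: two read-waiters (r1, r2) followed by a write-waiter (w) */
	head.next = &r1; r1.prev = &head;
	r1.next = &r2;   r2.prev = &r1;
	r2.next = &w;    w.prev = &r2;
	w.next = &head;  head.prev = &w;

	cut_before(&wlist, &head, &w);	/* the first pass stopped at the writer */

	assert(wlist.next == &r1 && wlist.prev == &r2);	/* readers moved */
	assert(head.next == &w && head.prev == &w);	/* writer still queued */
	puts("read-waiters split onto wlist; write-waiter left on the wait list");
	return 0;
}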