Diffstat (limited to 'ipc/sem.c')
 -rw-r--r--  ipc/sem.c | 180
 1 file changed, 123 insertions(+), 57 deletions(-)
@@ -253,70 +253,112 @@ static void sem_rcu_free(struct rcu_head *head)
 }
 
 /*
+ * Wait until all currently ongoing simple ops have completed.
+ * Caller must own sem_perm.lock.
+ * New simple ops cannot start, because simple ops first check
+ * that a) sem_perm.lock is free and b) complex_count is 0.
+ */
+static void sem_wait_array(struct sem_array *sma)
+{
+	int i;
+	struct sem *sem;
+
+	if (sma->complex_count) {
+		/* The thread that increased sma->complex_count waited on
+		 * all sem->lock locks. Thus we don't need to wait again.
+		 */
+		return;
+	}
+
+	for (i = 0; i < sma->sem_nsems; i++) {
+		sem = sma->sem_base + i;
+		spin_unlock_wait(&sem->lock);
+	}
+}
+
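
The added helper is easier to reason about outside the kernel. The following
is a minimal userspace model of the same waiting scheme, built on C11
atomics; all names in it (model_wait_array, slot_locked, NSEMS) are
hypothetical, and the spinning acquire-loads stand in for spin_unlock_wait():

	#include <stdatomic.h>

	#define NSEMS 16

	static atomic_int slot_locked[NSEMS];	/* 1 while a simple op holds slot i */
	static atomic_int complex_count;	/* enqueued complex operations */

	/* Caller must hold the global lock (stands in for sem_perm.lock). */
	static void model_wait_array(void)
	{
		int i;

		if (atomic_load(&complex_count))
			return;	/* an earlier complex op already did the waiting */

		for (i = 0; i < NSEMS; i++) {
			/* like spin_unlock_wait(): spin until slot i is free */
			while (atomic_load_explicit(&slot_locked[i],
						    memory_order_acquire))
				;
		}
	}
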
+/*
  * If the request contains only one semaphore operation, and there are
  * no complex transactions pending, lock only the semaphore involved.
  * Otherwise, lock the entire semaphore array, since we either have
  * multiple semaphores in our own semops, or we need to look at
  * semaphores from other pending complex operations.
- *
- * Carefully guard against sma->complex_count changing between zero
- * and non-zero while we are spinning for the lock. The value of
- * sma->complex_count cannot change while we are holding the lock,
- * so sem_unlock should be fine.
- *
- * The global lock path checks that all the local locks have been released,
- * checking each local lock once. This means that the local lock paths
- * cannot start their critical sections while the global lock is held.
  */
 static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
 			      int nsops)
 {
-	int locknum;
-again:
-	if (nsops == 1 && !sma->complex_count) {
-		struct sem *sem = sma->sem_base + sops->sem_num;
+	struct sem *sem;
 
-		/* Lock just the semaphore we are interested in. */
-		spin_lock(&sem->lock);
+	if (nsops != 1) {
+		/* Complex operation - acquire a full lock */
+		ipc_lock_object(&sma->sem_perm);
 
-		/*
-		 * If sma->complex_count was set while we were spinning,
-		 * we may need to look at things we did not lock here.
+		/* And wait until all simple ops that are processed
+		 * right now have dropped their locks.
 		 */
-		if (unlikely(sma->complex_count)) {
-			spin_unlock(&sem->lock);
-			goto lock_array;
-		}
+		sem_wait_array(sma);
+		return -1;
+	}
 
+	/*
+	 * Only one semaphore affected - try to optimize locking.
+	 * The rules are:
+	 * - optimized locking is possible if no complex operation
+	 *   is either enqueued or processed right now.
+	 * - The test for enqueued complex ops is simple:
+	 *      sma->complex_count != 0
+	 * - Testing for complex ops that are processed right now is
+	 *   a bit more difficult. Complex ops acquire the full lock
+	 *   and first wait until the running simple ops have completed.
+	 *   (see above)
+	 *   Thus: If we own a simple lock and the global lock is free
+	 *   and complex_count is now 0, then it will stay 0 and
+	 *   thus just locking sem->lock is sufficient.
+	 */
+	sem = sma->sem_base + sops->sem_num;
+
+	if (sma->complex_count == 0) {
 		/*
-		 * Another process is holding the global lock on the
-		 * sem_array; we cannot enter our critical section,
-		 * but have to wait for the global lock to be released.
+		 * It appears that no complex operation is around.
+		 * Acquire the per-semaphore lock.
 		 */
-		if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
-			spin_unlock(&sem->lock);
-			spin_unlock_wait(&sma->sem_perm.lock);
-			goto again;
+		spin_lock(&sem->lock);
+
+		/* Then check that the global lock is free */
+		if (!spin_is_locked(&sma->sem_perm.lock)) {
+			/* spin_is_locked() is not a memory barrier */
+			smp_mb();
+
+			/* Now repeat the test of complex_count:
+			 * It can't change anymore until we drop sem->lock.
+			 * Thus: if it is now 0, then it will stay 0.
+			 */
+			if (sma->complex_count == 0) {
+				/* fast path successful! */
+				return sops->sem_num;
+			}
 		}
+		spin_unlock(&sem->lock);
+	}
 
-		locknum = sops->sem_num;
+	/* slow path: acquire the full lock */
+	ipc_lock_object(&sma->sem_perm);
+
+	if (sma->complex_count == 0) {
+		/* False alarm:
+		 * There is no complex operation, thus we can switch
+		 * back to the fast path.
+		 */
+		spin_lock(&sem->lock);
+		ipc_unlock_object(&sma->sem_perm);
+		return sops->sem_num;
 	} else {
-		int i;
-		/*
-		 * Lock the semaphore array, and wait for all of the
-		 * individual semaphore locks to go away.  The code
-		 * above ensures no new single-lock holders will enter
-		 * their critical section while the array lock is held.
+		/* Not a false alarm, thus complete the sequence for a
+		 * full lock.
 		 */
-lock_array:
-		ipc_lock_object(&sma->sem_perm);
-		for (i = 0; i < sma->sem_nsems; i++) {
-			struct sem *sem = sma->sem_base + i;
-			spin_unlock_wait(&sem->lock);
-		}
-		locknum = -1;
+		sem_wait_array(sma);
+		return -1;
 	}
-	return locknum;
 }
 
 static inline void sem_unlock(struct sem_array *sma, int locknum)
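
Taken together, the rewritten sem_lock() and sem_wait_array() form a
handshake: a simple op publishes its per-semaphore lock before re-reading the
global state, while a complex op takes the global lock and then waits for
every per-semaphore lock. A sketch of the fast path in userspace C11 follows;
it is illustrative only, every name is made up, and
atomic_thread_fence(memory_order_seq_cst) plays the role of smp_mb():

	#include <stdatomic.h>
	#include <stdbool.h>

	#define NSEMS 16

	static atomic_int  complex_count;	/* enqueued complex ops */
	static atomic_bool global_locked;	/* models sem_perm.lock */
	static atomic_bool slot[NSEMS];		/* models the per-sem spinlocks */

	static void slot_lock(int n)
	{
		/* spin, like spin_lock(&sem->lock) */
		while (atomic_exchange_explicit(&slot[n], true,
						memory_order_acquire))
			;
	}

	static void slot_unlock(int n)
	{
		atomic_store_explicit(&slot[n], false, memory_order_release);
	}

	/* Returns true if slot n is now locked and the fast path may be used. */
	static bool model_try_fast_path(int n)
	{
		if (atomic_load(&complex_count) != 0)
			return false;		/* complex op enqueued */

		slot_lock(n);

		/* global lock must be free; reading it is no barrier ... */
		if (!atomic_load(&global_locked)) {
			/* ... so fence here, as smp_mb() does above */
			atomic_thread_fence(memory_order_seq_cst);

			/* recheck: complex_count cannot change while we
			 * hold slot n, because complex ops wait for it */
			if (atomic_load(&complex_count) == 0)
				return true;	/* fast path succeeded */
		}

		slot_unlock(n);
		return false;			/* caller takes the full lock */
	}

A false return corresponds to the slow path above: the caller would take the
global lock, and may still drop back to the per-semaphore lock if
complex_count turns out to be zero after all.
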
@@ -876,6 +918,24 @@ again:
 	}
 
 /**
+ * set_semotime(sma, sops) - set sem_otime
+ * @sma: semaphore array
+ * @sops: operations that modified the array, may be NULL
+ *
+ * sem_otime is replicated to avoid cache line thrashing.
+ * This function sets one instance to the current time.
+ */
+static void set_semotime(struct sem_array *sma, struct sembuf *sops)
+{
+	if (sops == NULL) {
+		sma->sem_base[0].sem_otime = get_seconds();
+	} else {
+		sma->sem_base[sops[0].sem_num].sem_otime =
+						get_seconds();
+	}
+}
+
+/**
  * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
  * @sma: semaphore array
  * @sops: operations that were performed
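
set_semotime() stamps only one of the replicated sem_otime copies, so any
reader has to fold all of them. In this file that reader is get_semotime()
(called in the last hunk below); the stand-alone sketch here shows the
assumed fold, with all model_* names hypothetical:

	#include <time.h>

	struct model_sem {
		time_t sem_otime;	/* one replicated copy per semaphore */
	};

	struct model_sem_array {
		struct model_sem *sem_base;
		int sem_nsems;
	};

	static time_t model_get_semotime(struct model_sem_array *sma)
	{
		time_t res = sma->sem_base[0].sem_otime;
		int i;

		/* each successful semop stamped only one slot, so the
		 * array-wide sem_otime is the newest of all copies */
		for (i = 1; i < sma->sem_nsems; i++)
			if (sma->sem_base[i].sem_otime > res)
				res = sma->sem_base[i].sem_otime;
		return res;
	}
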
@@ -925,17 +985,10 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
 			}
 		}
 	}
-	if (otime) {
-		if (sops == NULL) {
-			sma->sem_base[0].sem_otime = get_seconds();
-		} else {
-			sma->sem_base[sops[0].sem_num].sem_otime =
-					get_seconds();
-		}
-	}
+	if (otime)
+		set_semotime(sma, sops);
 }
 
-
 /* The following counts are associated to each semaphore:
  * semncnt number of tasks waiting on semval being nonzero
  * semzcnt number of tasks waiting on semval being zero
@@ -1797,12 +1850,17 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 
 	error = perform_atomic_semop(sma, sops, nsops, un,
 					task_tgid_vnr(current));
-	if (error <= 0) {
-		if (alter && error == 0)
+	if (error == 0) {
+		/* If the operation was successful, then do
+		 * the required updates.
+		 */
+		if (alter)
 			do_smart_update(sma, sops, nsops, 1, &tasks);
-
-		goto out_unlock_free;
+		else
+			set_semotime(sma, sops);
 	}
+	if (error <= 0)
+		goto out_unlock_free;
 
 	/* We need to sleep on this operation, so we put the current
 	 * task into the pending queue and go to sleep.
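
The restructured branch makes the three-way result convention of
perform_atomic_semop(), as it can be read from the surrounding code, explicit:
0 means the operations were applied, a negative value is an error, and a
positive value means the task must sleep. A condensed restatement, with
model_* stand-ins instead of the real kernel helpers:

	/* stand-ins for do_smart_update() and set_semotime() */
	static void model_smart_update(void) { /* wake waiters, stamp sem_otime */ }
	static void model_set_otime(void)    { /* only stamp sem_otime */ }

	static int model_handle_result(int error, int alter)
	{
		if (error == 0) {
			/* success: altering ops run the full update pass,
			 * which stamps sem_otime as it goes ... */
			if (alter)
				model_smart_update();
			else	/* ... non-altering ops must stamp it here */
				model_set_otime();
		}
		if (error <= 0)
			return error;	/* finished: success (0) or failure (<0) */

		return 1;		/* error > 0: queue the task and sleep */
	}
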
@@ -2061,6 +2119,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 	struct sem_array *sma = it;
 	time_t sem_otime;
 
+	/*
+	 * The proc interface isn't aware of sem_lock(); it calls
+	 * ipc_lock_object() directly (in sysvipc_find_ipc).
+	 * In order to stay compatible with sem_lock(), we must wait until
+	 * all simple semop() calls have left their critical regions.
+	 */
+	sem_wait_array(sma);
+
 	sem_otime = get_semotime(sma);
 
 	return seq_printf(s,
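
Note that the combination used here - ipc_lock_object() taken by
sysvipc_find_ipc() plus the sem_wait_array() call added above - is exactly
the sequence the rewritten sem_lock() performs for complex operations (the
nsops != 1 branch in the first hunk), so the proc reader gets the same
exclusion against in-flight simple ops as a complex semop() would.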