author    David Rientjes <rientjes@google.com>    2011-11-02 16:38:39 -0400
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>    2012-08-01 15:27:19 -0400
commit    6b63ea81d831b2b6f2ce6d60cfcfa05b1858ad97 (patch)
tree      cb304df336644ded568318451fb8feee27708d78 /kernel
parent    4d01a2e38a9c27d3de083ecc561b32b6a55fc7eb (diff)
cpusets: avoid looping when storing to mems_allowed if one node remains set
commit 89e8a244b97e48f1f30e898b6f32acca477f2a13 upstream.

Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is
extremely expensive and severely impacted page allocator performance.
This is part of a series of patches that reduce page allocator overhead.

{get,put}_mems_allowed() exist so that general kernel code may locklessly
access a task's set of allowable nodes without the chance that a
concurrent write will cause the nodemask to be empty on configurations
where MAX_NUMNODES > BITS_PER_LONG.

This could incur a significant delay, however, especially in low memory
conditions, because the page allocator is blocking and reclaim requires
get_mems_allowed() itself. It is not atypical to see writes to
cpuset.mems take over 2 seconds to complete, for example. In low memory
conditions this is problematic because it is one of the most important
times to change cpuset.mems in the first place!

The only way a task's set of allowable nodes may change is through
cpusets, by writing to cpuset.mems or by attaching the task to a cpuset
with a different nodemask. The store is done by first setting all the
new nodes, waiting until generic code is not reading the nodemask with
get_mems_allowed() at the same time, and then clearing all the old
nodes. This prevents the possibility that a reader will see an empty
nodemask at the same time the writer is storing a new nodemask.

If at least one node remains unchanged, though, it's possible to simply
set all new nodes and then clear all the old nodes without waiting.
Changing a task's nodemask is protected by cgroup_mutex, so it's
guaranteed that two threads are not changing the same task's nodemask at
the same time; the nodemask is therefore guaranteed to be stored before
another thread changes it and determines whether a node remains set or
not.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Paul Menage <paul@paulmenage.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
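As a rough illustration of the set-then-clear ordering described above, here
is a minimal, hypothetical userspace model (not the kernel code itself;
store_nodemask and MASK_WORDS are invented for the example, standing in for
the nodes_or()-then-store sequence and a multi-word nodemask_t):

#include <assert.h>
#include <stdio.h>

#define MASK_WORDS 2    /* models MAX_NUMNODES > BITS_PER_LONG */

static unsigned long mems_allowed[MASK_WORDS] = { 0x3, 0x0 };  /* nodes 0,1 */

/* Set all new nodes first, then clear the old ones: a reader sampling
 * between the two steps sees old|new, never an empty mask. */
static void store_nodemask(const unsigned long *newmems)
{
        int i;

        for (i = 0; i < MASK_WORDS; i++)
                mems_allowed[i] |= newmems[i];  /* step 1: set new nodes */
        /* a concurrent reader here observes the union of old and new */
        for (i = 0; i < MASK_WORDS; i++)
                mems_allowed[i] &= newmems[i];  /* step 2: clear old nodes */
}

int main(void)
{
        const unsigned long newmems[MASK_WORDS] = { 0x6, 0x0 };  /* nodes 1,2 */

        store_nodemask(newmems);
        assert(mems_allowed[0] == 0x6 && mems_allowed[1] == 0x0);
        printf("mems_allowed = %#lx %#lx\n", mems_allowed[0], mems_allowed[1]);
        return 0;
}

The model only captures the ordering argument; the real code additionally
waits on ->mems_allowed_change_disable before the clearing step when the
old and new masks are disjoint.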
Diffstat (limited to 'kernel')
-rw-r--r--    kernel/cpuset.c    9
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9c9b7545c81..a9958936d89 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 static void cpuset_change_task_nodemask(struct task_struct *tsk,
 					nodemask_t *newmems)
 {
+	bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed);
+
 repeat:
 	/*
 	 * Allow tasks that have access to memory reserves because they have
@@ -963,7 +965,6 @@ repeat:
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
 
-
 	/*
 	 * ensure checking ->mems_allowed_change_disable after setting all new
 	 * allowed nodes.
@@ -980,9 +981,11 @@ repeat:
 
 	/*
 	 * Allocation of memory is very fast, we needn't sleep when waiting
-	 * for the read-side.
+	 * for the read-side. No wait is necessary, however, if at least one
+	 * node remains unchanged.
 	 */
-	while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+	while (masks_disjoint &&
+	       ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
 		task_unlock(tsk);
 		if (!task_curr(tsk))
 			yield();
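For illustration, a minimal userspace sketch of the disjointness test that
gates the new fast path (hypothetical, not kernel code; masks_intersect and
MASK_WORDS are invented stand-ins for nodes_intersects() and a multi-word
nodemask_t):

#include <stdbool.h>
#include <stdio.h>

#define MASK_WORDS 2    /* models MAX_NUMNODES > BITS_PER_LONG */

/* Stand-in for nodes_intersects(): true if any node is set in both masks. */
static bool masks_intersect(const unsigned long *a, const unsigned long *b)
{
        int i;

        for (i = 0; i < MASK_WORDS; i++)
                if (a[i] & b[i])
                        return true;
        return false;
}

int main(void)
{
        const unsigned long cur[MASK_WORDS]  = { 0x3, 0x0 };  /* nodes 0,1 */
        const unsigned long keep[MASK_WORDS] = { 0x6, 0x0 };  /* nodes 1,2: node 1 persists */
        const unsigned long move[MASK_WORDS] = { 0x0, 0x1 };  /* fully disjoint */

        /* node 1 stays set throughout the store, so no wait is needed */
        printf("skip wait: %d\n", masks_intersect(cur, keep));  /* prints 1 */
        /* disjoint masks: the writer must wait for readers to finish */
        printf("skip wait: %d\n", masks_intersect(cur, move));  /* prints 0 */
        return 0;
}

When the masks intersect, at least one node remains set during both store
steps, so a reader can never observe an empty nodemask and the while loop
above is skipped entirely; only fully disjoint masks still pay the wait.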