From c0ff7453bb5c7c98e0885fb94279f2571946f280 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 24 May 2010 14:32:08 -0700 Subject: cpuset,mm: fix no node to alloc memory when changing cpuset's mems Before applying this patch, cpuset updates task->mems_allowed and mempolicy by setting all new bits in the nodemask first, and clearing all old unallowed bits later. But this way, the allocator may find that there is no node to alloc memory. The reason is that when cpuset rebinds the task's mempolicy, it clears the nodes which the allocator can alloc pages on, for example: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom This patch fixes this problem by expanding the nodes range first (set newly allowed bits) and shrinking it lazily (clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. 
[akpm@linux-foundation.org: fix spello] Signed-off-by: Miao Xie Cc: David Rientjes Cc: Nick Piggin Cc: Paul Menage Cc: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b55e988988b5..415b8f8a3f45 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1421,6 +1421,7 @@ struct task_struct { #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ + int mems_allowed_change_disable; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS -- cgit v1.2.2 From 4be929be34f9bdeffa40d815d32d7d60d2c7f03b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 24 May 2010 14:33:03 -0700 Subject: kernel-wide: replace USHORT_MAX, SHORT_MAX and SHORT_MIN with USHRT_MAX, SHRT_MAX and SHRT_MIN - C99 knows about USHRT_MAX/SHRT_MAX/SHRT_MIN, not USHORT_MAX/SHORT_MAX/SHORT_MIN. - Make SHRT_MIN of type s16, not int, for consistency. [akpm@linux-foundation.org: fix drivers/dma/timb_dma.c] [akpm@linux-foundation.org: fix security/keys/keyring.c] Signed-off-by: Alexey Dobriyan Acked-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 415b8f8a3f45..c0151ffd3541 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -384,7 +384,7 @@ struct user_namespace; * 1-3 now and depends on arch. We use "5" as safe margin, here. 
*/ #define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHORT_MAX - MAPCOUNT_ELF_CORE_MARGIN) +#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; -- cgit v1.2.2 From 6adef3ebe570bcde67fd6c16101451ddde5712b5 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Wed, 26 May 2010 14:42:49 -0700 Subject: cpusets: new round-robin rotor for SLAB allocations We have observed several workloads running on multi-node systems where memory is assigned unevenly across the nodes in the system. There are numerous reasons for this but one is the round-robin rotor in cpuset_mem_spread_node(). For example, a simple test that writes a multi-page file will allocate pages on nodes 0 2 4 6 ... Odd nodes are skipped. (Sometimes it allocates on odd nodes & skips even nodes). An example is shown below. The program "lfile" writes a file consisting of 10 pages. The program then mmaps the file & uses get_mempolicy(..., MPOL_F_NODE) to determine the nodes where the file pages were allocated. The output is shown below: # ./lfile allocated on nodes: 2 4 6 0 1 2 6 0 2 There is a single rotor that is used for allocating both file pages & slab pages. Writing the file allocates both a data page & a slab page (buffer_head). This advances the RR rotor 2 nodes for each page allocated. A quick confirmation seems to confirm this is the cause of the uneven allocation: # echo 0 >/dev/cpuset/memory_spread_slab # ./lfile allocated on nodes: 6 7 8 9 0 1 2 3 4 5 This patch introduces a second rotor that is used for slab allocations. 
Signed-off-by: Jack Steiner Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Paul Menage Cc: Jack Steiner Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index c0151ffd3541..4f31a166b1a1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1423,6 +1423,7 @@ struct task_struct { nodemask_t mems_allowed; /* Protected by alloc_lock */ int mems_allowed_change_disable; int cpuset_mem_spread_rotor; + int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock */ -- cgit v1.2.2 From 09faef11df8c559a23e2405d123cb2683733a79a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:11 -0700 Subject: exit: change zap_other_threads() to count sub-threads Change zap_other_threads() to return the number of other sub-threads found on ->thread_group list. 
Other changes are cosmetic: - change the code to use while_each_thread() helper - remove the obsolete comment about SIGKILL/SIGSTOP Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Veaceslav Falico Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4f31a166b1a1..a95a2455cebe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2036,7 +2036,7 @@ extern int do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int, struct task_struct *); extern int send_sig(int, struct task_struct *, int); -extern void zap_other_threads(struct task_struct *p); +extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); -- cgit v1.2.2 From ea6d290ca34c4fd91b7348338c0cc7bdeff94a35 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:16 -0700 Subject: signals: make task_struct->signal immutable/refcountable We have a lot of problems with accessing task_struct->signal, it can "disappear" at any moment. Even current can't use its ->signal safely after exit_notify(). ->siglock helps, but it is not convenient, not always possible, and sometimes it makes sense to use task->signal even after this task has already died. This patch adds the reference counter, sigcnt, into signal_struct. This reference is owned by task_struct and it is dropped in __put_task_struct(). Perhaps it makes sense to export get/put_signal_struct() later, but currently I don't see the immediate reason. Rename __cleanup_signal() to free_signal_struct() and unexport it. With the previous changes it does nothing except kmem_cache_free(). 
Change __exit_signal() to not clear/free ->signal, it will be freed when the last reference to any thread in the thread group goes away. Note: - when the last thread exits signal->tty can point to nowhere, see the next patch. - with or without this patch signal_struct->count should go away, or at least it should be "int nr_threads" for fs/proc. This will be addressed later. Signed-off-by: Oleg Nesterov Cc: Alan Cox Cc: Ingo Molnar Cc: Peter Zijlstra Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index a95a2455cebe..32e309df408c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -527,6 +527,7 @@ struct thread_group_cputimer { * the locking of signal_struct. */ struct signal_struct { + atomic_t sigcnt; atomic_t count; atomic_t live; @@ -2101,7 +2102,6 @@ extern void flush_thread(void); extern void exit_thread(void); extern void exit_files(struct task_struct *); -extern void __cleanup_signal(struct signal_struct *); extern void __cleanup_sighand(struct sighand_struct *); extern void exit_itimers(struct signal_struct *); -- cgit v1.2.2 From b7b8ff6373d4b910af081f76888395e6df53249d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:18 -0700 Subject: signals: kill the awful task_rq_unlock_wait() hack Now that task->signal can't go away we can revert the horrible hack added by ad474caca3e2a0550b7ce0706527ad5ab389a4d4 ("fix for account_group_exec_runtime(), make sure ->signal can't be freed under rq->lock"). And we can do more cleanups in sched_stats.h/posix-cpu-timers.c later. 
Signed-off-by: Oleg Nesterov Cc: Alan Cox Cc: Ingo Molnar Cc: Peter Zijlstra Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 32e309df408c..2d1e1a1228ef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -268,7 +268,6 @@ extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); extern int runqueue_is_locked(int cpu); -extern void task_rq_unlock_wait(struct task_struct *p); extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) -- cgit v1.2.2 From a705be6b5e8b05f2ae51536ec709de921960326c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:19 -0700 Subject: kill the obsolete thread_group_cputime_free() helper Kill the empty thread_group_cputime_free() helper. It was needed to free the per-cpu data which we no longer have. Signed-off-by: Oleg Nesterov Cc: Balbir Singh Cc: Roland McGrath Cc: Veaceslav Falico Cc: Stanislaw Gruszka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2d1e1a1228ef..dd597d8013a8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2393,10 +2393,6 @@ static inline void thread_group_cputime_init(struct signal_struct *sig) spin_lock_init(&sig->cputimer.lock); } -static inline void thread_group_cputime_free(struct signal_struct *sig) -{ -} - /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. 
-- cgit v1.2.2 From 7e49827cc937a742ae02078b483e3eb78f791a2a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:22 -0700 Subject: proc: get_nr_threads() doesn't need ->siglock any longer Now that task->signal can't go away get_nr_threads() doesn't need ->siglock to read signal->count. Also, make it inline, move into sched.h, and convert 2 other proc users of signal->count to use this (now trivial) helper. Henceforth get_nr_threads() is the only valid user of signal->count, we are ready to turn it into "int nr_threads" or, perhaps, kill it. Signed-off-by: Oleg Nesterov Cc: Alexey Dobriyan Cc: David Howells Cc: "Eric W. Biederman" Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index dd597d8013a8..ccd2d1500720 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2147,6 +2147,11 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +static inline int get_nr_threads(struct task_struct *tsk) +{ + return atomic_read(&tsk->signal->count); +} + /* de_thread depends on thread_group_leader not being a pid based check */ #define thread_group_leader(p) (p == p->group_leader) -- cgit v1.2.2 From b3ac022cb9dc5883505a88b159d1b240ad1ef405 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:24 -0700 Subject: proc: turn signal_struct->count into "int nr_threads" No functional changes, just s/atomic_t count/int nr_threads/. With the recent changes this counter has a single user, get_nr_threads() And, none of its callers need the really accurate number of threads, not to mention each caller obviously races with fork/exit. 
It is only used to report this value to the user-space, except first_tid() uses it to avoid the unnecessary while_each_thread() loop in the unlikely case. It is a bit sad we need a word in struct signal_struct for this, perhaps we can change get_nr_threads() to approximate the number of threads using signal->live and kill ->nr_threads later. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oleg Nesterov Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index ccd2d1500720..f118809c953f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -527,8 +527,8 @@ struct thread_group_cputimer { */ struct signal_struct { atomic_t sigcnt; - atomic_t count; atomic_t live; + int nr_threads; wait_queue_head_t wait_chldexit; /* for wait4() */ @@ -2149,7 +2149,7 @@ extern bool current_is_single_threaded(void); static inline int get_nr_threads(struct task_struct *tsk) { - return atomic_read(&tsk->signal->count); + return tsk->signal->nr_threads; } /* de_thread depends on thread_group_leader not being a pid based check */ -- cgit v1.2.2