From 0793a61d4df8daeac6492dbf8d2f3e5713caae5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Dec 2008 20:12:29 +0100 Subject: performance counters: core code Implement the core kernel bits of Performance Counters subsystem. The Linux Performance Counter subsystem provides an abstraction of performance counter hardware capabilities. It provides per task and per CPU counters, and it provides event capabilities on top of those. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. The special file descriptor is opened via the perf_counter_open() system call: int perf_counter_open(u32 hw_event_type, u32 hw_event_period, u32 record_type, pid_t pid, int cpu); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. See more details in Documentation/perf-counters.txt. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 55e30d114477..4c530278391b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,6 +71,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1326,6 +1327,7 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif + struct perf_counter_context perf_counter_ctx; #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; @@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +/* + * Call the function if the target task is executing on a CPU right now: + */ +extern void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info); + + #ifdef CONFIG_MM_OWNER extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); -- cgit v1.2.2 From 6c594c21fcb02c662f11c97be4d7d2b73060a205 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:34:15 +0100 Subject: perfcounters: add task migrations counter Impact: add new feature, new sw counter Add a counter that counts the number of cross-CPU migrations a task is suffering. Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c530278391b..2e15be8fc792 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1014,6 +1014,8 @@ struct sched_entity { u64 last_wakeup; u64 avg_overlap; + u64 nr_migrations; + #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; @@ -1029,7 +1031,6 @@ struct sched_entity { u64 exec_max; u64 slice_max; - u64 nr_migrations; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; -- cgit v1.2.2 From 23a185ca8abbeef64b6ffc33059b1d630e43ec10 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 9 Feb 2009 22:42:47 +1100 Subject: perf_counters: make software counters work as per-cpu counters Impact: kernel crash fix Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software counter as a per-cpu counter would reliably crash the system, because it calls __task_delta_exec with a null pointer. The page fault, context switch and cpu migration counters also won't function correctly as per-cpu counters since they reference the current task. This fixes the problem by redirecting the task_clock counter to the cpu_clock counter when used as a per-cpu counter, and by implementing per-cpu page fault, context switch and cpu migration counters. Along the way, this: - Initializes counter->ctx earlier, in perf_counter_alloc, so that sw_perf_counter_init can use it - Adds code to kernel/sched.c to count task migrations into each cpu, in rq->nr_migrations_in - Exports the per-cpu context switch and task migration counts via new functions added to kernel/sched.c - Makes sure that if sw_perf_counter_init fails, we don't try to initialize the counter as a hardware counter. Since the user has passed a negative, non-raw event type, they clearly don't intend for it to be interpreted as a hardware event. Reported-by: "Zhang Yanmin" Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b85b10abf770..1e5f70062a9c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern u64 cpu_nr_switches(int cpu); +extern u64 cpu_nr_migrations(int cpu); struct seq_file; struct cfs_rq; -- cgit v1.2.2 From 4a0deca657f3dbb8a707b5dc8f173beec01e7ed2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:12 +0100 Subject: perf_counter: generic context switch event Impact: cleanup Use the generic software events for context switches. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.283522645@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 75b2fc5306d8..7ed41f7c5ace 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -138,7 +138,6 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); -extern u64 cpu_nr_switches(int cpu); extern u64 cpu_nr_migrations(int cpu); extern unsigned long get_parent_ip(unsigned long addr); -- cgit v1.2.2 From a26b89f05d194413c7238e0bea071054f6b5d3c8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:34 +0200 Subject: sched, hw-branch-tracer: add wait_task_context_switch() function to sched.h Add a function to wait until some other task has been switched out at least once. This differs from wait_task_inactive() subtly, in that the latter will wait until the task has left the CPU. Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144549.794157000@intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b94f3541f67b..a5b9a83065fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1993,8 +1993,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +extern void wait_task_context_switch(struct task_struct *p); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void wait_task_context_switch(struct task_struct *p) {} static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { -- cgit v1.2.2 From e2b371f00a6f529f6362654239bdec8dcd510760 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:35 +0200 Subject: mm, x86, ptrace, bts: defer branch trace stopping When a ptraced task is unlinked, we need to stop branch tracing for that task. Since the unlink is called with interrupts disabled, and we need interrupts enabled to stop branch tracing, we defer the work. Collect all branch tracing related stuff in a branch tracing context. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: Andrew Morton Cc: Peter Zijlstra Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144550.712401000@intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index a5b9a83065fa..52b8cd049c2e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -96,8 +96,8 @@ struct exec_domain; struct futex_pi_state; struct robust_list_head; struct bio; -struct bts_tracer; struct fs_struct; +struct bts_context; /* * List of flags we want to share for kernel threads, @@ -1210,12 +1210,7 @@ struct task_struct { * This is the tracer handle for the ptrace BTS extension. * This field actually belongs to the ptracer task. */ - struct bts_tracer *bts; - /* - * The buffer to hold the BTS data. - */ - void *bts_buffer; - size_t bts_size; + struct bts_context *bts; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ -- cgit v1.2.2 From 0f4814065ff8c24ca8bfd75c9b73502be152c287 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:48 +0200 Subject: x86, ptrace: add bts context unconditionally Add the ptrace bts context field to task_struct unconditionally. Initialize the field directly in copy_process(). Remove all the unneeded functionality used to initialize that field. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144603.292754000@intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 52b8cd049c2e..451186a22ef5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1205,13 +1205,11 @@ struct task_struct { struct list_head ptraced; struct list_head ptrace_entry; -#ifdef CONFIG_X86_PTRACE_BTS /* * This is the tracer handle for the ptrace BTS extension. * This field actually belongs to the ptracer task. */ struct bts_context *bts; -#endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; -- cgit v1.2.2 From 05725f7eb4b8acb147c5fc7b91397b1f6bcab00d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 14 Apr 2009 20:17:16 +0200 Subject: rculist: use list_entry_rcu in places where it's appropriate Use previously introduced list_entry_rcu instead of an open-coded list_entry + rcu_dereference combination. Signed-off-by: Jiri Pirko Reviewed-by: Paul E. McKenney Cc: dipankar@in.ibm.com LKML-Reference: <20090414181715.GA3634@psychotron.englab.brq.redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049c..886df41e7452 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -77,6 +77,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -2010,7 +2011,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif -#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) +#define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) @@ -2049,8 +2051,8 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2) static inline struct task_struct *next_thread(const struct task_struct *p) { - return list_entry(rcu_dereference(p->thread_group.next), - struct task_struct, thread_group); + return list_entry_rcu(p->thread_group.next, + struct task_struct, thread_group); } static inline int thread_group_empty(struct task_struct *p) -- cgit v1.2.2 From 261842b7c9099f56de2eb969c8ad65402d68e00e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Apr 2009 21:41:52 -0400 Subject: tracing: add same level recursion detection The tracing infrastructure allows for recursion. That is, an interrupt may interrupt the act of tracing an event, and that interrupt may very well perform its own trace. This is a recursive trace, and is fine to do. The problem arises when there is a bug, and the utility doing the trace calls something that recurses back into the tracer. This recursion is not caused by an external event like an interrupt, but by code that is not expected to recurse. The result could be a lockup. This patch adds a bitmask to the task structure that keeps track of the trace recursion. To find the interrupt depth, the following algorithm is used: level = hardirq_count() + softirq_count() + in_nmi; Here, level will be the depth of interrutps and softirqs, and even handles the nmi. Then the corresponding bit is set in the recursion bitmask. If the bit was already set, we know we had a recursion at the same level and we warn about it and fail the writing to the buffer. After the data has been committed to the buffer, we clear the bit. No atomics are needed. The only races are with interrupts and they reset the bitmask before returning anywy. [ Impact: detect same irq level trace recursion ] Signed-off-by: Steven Rostedt --- include/linux/sched.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049c..7ede5e490913 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1428,7 +1428,9 @@ struct task_struct { #ifdef CONFIG_TRACING /* state flags for use by tracers */ unsigned long trace; -#endif + /* bitmask of trace recursion */ + unsigned long trace_recursion; +#endif /* CONFIG_TRACING */ }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ -- cgit v1.2.2 From 3bcac0263f0b45e67a64034ebcb69eb9abb742f4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Apr 2009 13:45:05 +0100 Subject: SELinux: Don't flush inherited SIGKILL during execve() Don't flush inherited SIGKILL during execve() in SELinux's post cred commit hook. This isn't really a security problem: if the SIGKILL came before the credentials were changed, then we were right to receive it at the time, and should honour it; if it came after the creds were changed, then we definitely should honour it; and in any case, all that will happen is that the process will be scrapped before it ever returns to userspace. Signed-off-by: David Howells Signed-off-by: Oleg Nesterov Signed-off-by: James Morris --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1d19c025f9d2..d3b787c7aef3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1875,6 +1875,7 @@ extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); +extern void __flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); -- cgit v1.2.2 From 5e751e992f3fb08ba35e1ca8095ec8fbf9eda523 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 May 2009 13:55:22 +0100 Subject: CRED: Rename cred_exec_mutex to reflect that it's a guard against ptrace Rename cred_exec_mutex to reflect that it's a guard against foreign intervention on a process's credential state, such as is made by ptrace(). The attachment of a debugger to a process affects execve()'s calculation of the new credential state - _and_ also setprocattr()'s calculation of that state. Signed-off-by: David Howells Signed-off-by: James Morris --- include/linux/sched.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3fa82b353c98..5932ace22400 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1247,7 +1247,9 @@ struct task_struct { * credentials (COW) */ const struct cred *cred; /* effective (overridable) subjective task * credentials (COW) */ - struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ + struct mutex cred_guard_mutex; /* guard against foreign influences on + * credential calculations + * (notably. ptrace) */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock -- cgit v1.2.2 From cd1bb94b4a0531e8211a3774f17de831f8285f76 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:15:34 +0530 Subject: timers: /proc/sys sysctl hook to enable timer migration * Arun R Bharadwaj [2009-04-16 12:11:36]: This patch creates the /proc/sys sysctl interface at /proc/sys/kernel/timer_migration Timer migration is enabled by default. To disable timer migration, when CONFIG_SCHED_DEBUG = y, echo 0 > /proc/sys/kernel/timer_migration Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049c..618504010400 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1766,6 +1766,7 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_timer_migration; int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, -- cgit v1.2.2 From eea08f32adb3f97553d49a4f79a119833036000a Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Thu, 16 Apr 2009 12:16:41 +0530 Subject: timers: Logic to move non pinned timers * Arun R Bharadwaj [2009-04-16 12:11:36]: This patch migrates all non pinned timers and hrtimers to the current idle load balancer, from all the idle CPUs. Timers firing on busy CPUs are not migrated. While migrating hrtimers, care should be taken to check if migrating a hrtimer would result in a latency or not. So we compare the expiry of the hrtimer with the next timer interrupt on the target cpu and migrate the hrtimer only if it expires *after* the next interrupt on the target cpu. So, added a clockevents_get_next_event() helper function to return the next_event on the target cpu's clock_event_device. [ tglx: cleanups and simplifications ] Signed-off-by: Arun R Bharadwaj Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 618504010400..311dec123974 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -257,6 +257,7 @@ extern void task_rq_unlock_wait(struct task_struct *p); extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); +extern int get_nohz_load_balancer(void); #else static inline int select_nohz_load_balancer(int cpu) { @@ -1772,6 +1773,17 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_DEBUG +static inline unsigned int get_sysctl_timer_migration(void) +{ + return sysctl_timer_migration; +} +#else +static inline unsigned int get_sysctl_timer_migration(void) +{ + return 1; +} +#endif extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; -- cgit v1.2.2 From 789f90fcf6b0b54e655740e9396c954378542c79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:27 +0200 Subject: perf_counter: per user mlock gift Instead of a per-process mlock gift for perf-counters, use a per-user gift so that there is less of a DoS potential. [ Impact: allow less worst-case unprivileged memory consumption ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.496182835@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index d1857580a132..ff59d1231519 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -674,6 +674,10 @@ struct user_struct { struct work_struct work; #endif #endif + +#ifdef CONFIG_PERF_COUNTERS + atomic_long_t locked_vm; +#endif }; extern int uids_sysfs_init(void); -- cgit v1.2.2 From dce48a84adf1806676319f6f480e30a6daa012f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 11 Apr 2009 10:43:41 +0200 Subject: sched, timers: move calc_load() to scheduler Dimitri Sivanich noticed that xtime_lock is held write locked across calc_load() which iterates over all online CPUs. That can cause long latencies for xtime_lock readers on large SMP systems. The load average calculation is an rough estimate anyway so there is no real need to protect the readers vs. the update. It's not a problem when the avenrun array is updated while a reader copies the values. Instead of iterating over all online CPUs let the scheduler_tick code update the number of active tasks shortly before the avenrun update happens. The avenrun update itself is handled by the CPU which calls do_timer(). [ Impact: reduce xtime_lock write locked section ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049c..6eb4892efe45 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -135,8 +135,8 @@ DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); -extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern void calc_global_load(void); extern unsigned long get_parent_ip(unsigned long addr); -- cgit v1.2.2 From 2d02494f5a90f2e4b3c4c6acc85ec94674cdc431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 May 2009 20:08:52 +0200 Subject: sched, timers: cleanup avenrun users avenrun is an rough estimate so we don't have to worry about consistency of the three avenrun values. Remove the xtime lock dependency and provide a function to scale the values. Cleanup the users. [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6eb4892efe45..de7b3b217772 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -116,6 +116,7 @@ struct fs_struct; * 11 bit fractions. */ extern unsigned long avenrun[]; /* Load averages */ +extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1< Date: Wed, 13 May 2009 16:55:10 +0000 Subject: syscall: Implement a convinience function restart_syscall Currently when we have a signal pending we have the functionality to restart that the current system call. There are other cases such as nasty lock ordering issues where it makes sense to have a simple fix that uses try lock and restarts the system call. Buying time to figure out how to rework the locking strategy. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049c..d853f6bb0baf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2178,6 +2178,12 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +static inline int restart_syscall(void) +{ + set_tsk_thread_flag(current, TIF_SIGPENDING); + return -ERESTARTNOINTR; +} + static inline int signal_pending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); -- cgit v1.2.2 From 4200efd9acda4accf24640f1e77d24fdcdb524df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 May 2009 09:22:19 +0200 Subject: sched: properly define the sched_group::cpumask and sched_domain::span fields Properly document the variable-size structure tricks we are doing wrt. struct sched_group and sched_domain, and use the field[0] GCC extension instead of defining a vla array. Dont use unions for this, as pointed out by Linus. [ Impact: cleanup, un-confuse Sparse and LLVM ] Reported-by: Jeff Garzik Acked-by: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index de7b3b217772..dbb1043e8656 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -839,7 +839,17 @@ struct sched_group { */ u32 reciprocal_cpu_power; - unsigned long cpumask[]; + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + * + * It is also be embedded into static data structures at build + * time. (See 'struct static_sched_group' in kernel/sched.c) + */ + unsigned long cpumask[0]; }; static inline struct cpumask *sched_group_cpus(struct sched_group *sg) @@ -925,8 +935,17 @@ struct sched_domain { char *name; #endif - /* span of all CPUs in this domain */ - unsigned long span[]; + /* + * Span of all CPUs in this domain. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + * + * It is also be embedded into static data structures at build + * time. (See 'struct static_sched_domain' in kernel/sched.c) + */ + unsigned long span[0]; }; static inline struct cpumask *sched_domain_span(struct sched_domain *sd) -- cgit v1.2.2 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index ff59d1231519..9714d450f417 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,7 +71,6 @@ struct sched_param { #include #include #include -#include #include #include #include @@ -99,6 +98,7 @@ struct robust_list_head; struct bio; struct bts_tracer; struct fs_struct; +struct perf_counter_context; /* * List of flags we want to share for kernel threads, @@ -1387,7 +1387,9 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif - struct perf_counter_context perf_counter_ctx; +#ifdef CONFIG_PERF_COUNTERS + struct perf_counter_context *perf_counter_ctxp; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; -- cgit v1.2.2 From 082ff5a2767a0679ee543f14883adbafb631ffbe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:00 +0200 Subject: perf_counter: Change pctrl() behaviour Instead of en/dis-abling all counters acting on a particular task, en/dis- able all counters we created. [ v2: fix crash on first counter enable ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.916937244@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9714d450f417..bc9326dcdde1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1389,6 +1389,8 @@ struct task_struct { #endif #ifdef CONFIG_PERF_COUNTERS struct perf_counter_context *perf_counter_ctxp; + struct mutex perf_counter_mutex; + struct list_head perf_counter_list; #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; -- cgit v1.2.2 From 3959214f971417f4162926ac52ad4cd042958caa Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 24 Mar 2009 15:43:30 +0100 Subject: sched: delayed cleanup of user_struct During bootup performance tracing we see repeated occurrences of /sys/kernel/uid/* events for the same uid, leading to a, in this case, rather pointless userspace processing for the same uid over and over. This is usually caused by tools which change their uid to "nobody", to run without privileges to read data supplied by untrusted users. This change delays the execution of the (already existing) scheduled work, to cleanup the uid after one second, so the allocated and announced uid can possibly be re-used by another process. This is the current behavior, where almost every invocation of a binary, which changes the uid, creates two events: $ read START < /sys/kernel/uevent_seqnum; \ for i in `seq 100`; do su --shell=/bin/true bin; done; \ read END < /sys/kernel/uevent_seqnum; \ echo $(($END - $START)) 178 With the delayed cleanup, we get only two events, and userspace finishes a bit faster too: $ read START < /sys/kernel/uevent_seqnum; \ for i in `seq 100`; do su --shell=/bin/true bin; done; \ read END < /sys/kernel/uevent_seqnum; \ echo $(($END - $START)) 1 Acked-by: Dhaval Giani Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index c900aa530070..7531b1c28201 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -674,7 +674,7 @@ struct user_struct { struct task_group *tg; #ifdef CONFIG_SYSFS struct kobject kobj; - struct work_struct work; + struct delayed_work work; #endif #endif -- cgit v1.2.2 From 58568d2a8215cb6f55caf2332017d7bdff954e1c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 16 Jun 2009 15:31:49 -0700 Subject: cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index c900aa530070..1048bf50540a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1318,7 +1318,8 @@ struct task_struct { /* Thread group tracking */ u32 parent_exec_id; u32 self_exec_id; -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ +/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, + * mempolicy */ spinlock_t alloc_lock; #ifdef CONFIG_GENERIC_HARDIRQS @@ -1386,8 +1387,7 @@ struct task_struct { cputime_t acct_timexpd; /* stime + utime since last update */ #endif #ifdef CONFIG_CPUSETS - nodemask_t mems_allowed; - int cpuset_mems_generation; + nodemask_t mems_allowed; /* Protected by alloc_lock */ int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS @@ -1410,7 +1410,7 @@ struct task_struct { struct list_head perf_counter_list; #endif #ifdef CONFIG_NUMA - struct mempolicy *mempolicy; + struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; #endif atomic_t fs_excl; /* holding fs exclusive resources */ -- cgit v1.2.2 From 2ff05b2b4eac2e63d345fc731ea151a060247f53 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:56 -0700 Subject: oom: move oom_adj value from task_struct to mm_struct The per-task oom_adj value is a characteristic of its mm more than the task itself since it's not possible to oom kill any thread that shares the mm. If a task were to be killed while attached to an mm that could not be freed because another thread were set to OOM_DISABLE, it would have needlessly been terminated since there is no potential for future memory freeing. This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct mm_struct. This requires task_lock() on a task to check its oom_adj value to protect against exec, but it's already necessary to take the lock when dereferencing the mm to find the total VM size for the badness heuristic. This fixes a livelock if the oom killer chooses a task and another thread sharing the same memory has an oom_adj value of OOM_DISABLE. This occurs because oom_kill_task() repeatedly returns 1 and refuses to kill the chosen task while select_bad_process() will repeatedly choose the same task during the next retry. Taking task_lock() in select_bad_process() to check for OOM_DISABLE and in oom_kill_task() to check for threads sharing the same memory will be removed in the next patch in this series where it will no longer be necessary. Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since these threads are immune from oom killing already. They simply report an oom_adj value of OOM_DISABLE. Cc: Nick Piggin Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1048bf50540a..1bc6fae0c135 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1178,7 +1178,6 @@ struct task_struct { * a short time */ unsigned char fpu_counter; - s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif -- cgit v1.2.2 From 20ebcdda78a282d1d5266887ddf8a2d670182576 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 17 Jun 2009 16:27:16 -0700 Subject: memcg: remove unneeded forward declaration from sched.h This forward declaration seems pointless. Signed-off-by: Li Zefan Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 02042e7f2196..d0342101756a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -92,7 +92,6 @@ struct sched_param { #include -struct mem_cgroup; struct exec_domain; struct futex_pi_state; struct robust_list_head; -- cgit v1.2.2 From 17f98dcf6010a1cfd25d179fd0ce77d3dc2685c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Jun 2009 16:27:51 -0700 Subject: pids: clean up find_task_by_pid variants find_task_by_pid_type_ns is only used to implement find_task_by_vpid and find_task_by_pid_ns, but both of them pass PIDTYPE_PID as first argument. So just fold find_task_by_pid_type_ns into find_task_by_pid_ns and use find_task_by_pid_ns to implement find_task_by_vpid. While we're at it also remove the exports for find_task_by_pid_ns and find_task_by_vpid - we don't have any modular callers left as the only modular caller of he old pre pid namespace find_task_by_pid (gfs2) was switched to pid_task which operates on a struct pid pointer instead of a pid_t. Given the confusion about pid_t values vs namespace that's generally the better option anyway and I think we're better of restricting modules to do it that way. Signed-off-by: Christoph Hellwig Cc: Pavel Emelyanov Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index d0342101756a..4d0754269884 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1878,9 +1878,6 @@ extern struct pid_namespace init_pid_ns; /* * find a task by one of its numerical ids * - * find_task_by_pid_type_ns(): - * it is the most generic call - it finds a task by all id, - * type and namespace specified * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): @@ -1889,9 +1886,6 @@ extern struct pid_namespace init_pid_ns; * see also find_vpid() etc in include/linux/pid.h */ -extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, - struct pid_namespace *ns); - extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); -- cgit v1.2.2