From ce0e7b28fb75cb003cfc8d0238613aaf1c55e797 Mon Sep 17 00:00:00 2001 From: Ryota Ozaki Date: Sat, 24 Oct 2009 01:20:10 +0900 Subject: sched, cpuacct: Fix niced guest time accounting CPU time of a guest is always accounted in 'user' time without concern for the nice value of its counterpart process although the guest is scheduled under the nice value. This patch fixes the defect and accounts cpu time of a niced guest in 'nice' time as same as a niced process. And also the patch adds 'guest_nice' to cpuacct. The value provides niced guest cpu time which is like 'nice' to 'user'. The original discussions can be found here: http://www.mail-archive.com/kvm@vger.kernel.org/msg23982.html http://www.mail-archive.com/kvm@vger.kernel.org/msg23860.html Signed-off-by: Ryota Ozaki Acked-by: Avi Kivity Cc: Peter Zijlstra LKML-Reference: <1256314810-7897-1-git-send-email-ozaki.ryota@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 348fa8874b52..c059044bc6dc 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -25,6 +25,7 @@ struct cpu_usage_stat { cputime64_t iowait; cputime64_t steal; cputime64_t guest; + cputime64_t guest_nice; }; struct kernel_stat { -- cgit v1.2.2 From 1477b6a7edd9ffa7bba4f9779ce9a76ce92761ed Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Nov 2009 16:14:16 +0900 Subject: sched: Remove unused __schedule() declaration __schedule() had been removed. Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra LKML-Reference: <4AF129C8.3030008@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 75e6e60bf583..f18102c4d0b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -349,7 +349,6 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); -asmlinkage void __schedule(void); asmlinkage void schedule(void); extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); -- cgit v1.2.2 From 2a2bb3142d326bb28b03875cabfc49baaac9a14a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Nov 2009 16:16:10 +0900 Subject: sched: Remove unused time_sync_thresh declaration time_sync_thresh had been removed. Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra LKML-Reference: <4AF12A3A.5050200@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f18102c4d0b8..754b3deed02b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -171,8 +171,6 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) } #endif -extern unsigned long long time_sync_thresh; - /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). -- cgit v1.2.2 From 9824a2b728b63e7ff586b9fd9293c819be79f0f3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Nov 2009 16:16:54 +0900 Subject: sched: Remove unused cpu_nr_migrations() cpu_nr_migrations() is not used, remove it. Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra LKML-Reference: <4AF12A66.6020609@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 754b3deed02b..dfc21fb76bf1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -145,7 +145,6 @@ extern unsigned long this_cpu_load(void); extern void calc_global_load(void); -extern u64 cpu_nr_migrations(int cpu); extern unsigned long get_parent_ip(unsigned long addr); -- cgit v1.2.2 From acc3f5d7cabbfd6cec71f0c1f9900621fa2d6ae7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:53:40 +1030 Subject: cpumask: Partition_sched_domains takes array of cpumask_var_t Currently partition_sched_domains() takes a 'struct cpumask *doms_new' which is a kmalloc'ed array of cpumask_t. You can't have such an array if 'struct cpumask' is undefined, as we plan for CONFIG_CPUMASK_OFFSTACK=y. So, we make this an array of cpumask_var_t instead: this is the same for the CONFIG_CPUMASK_OFFSTACK=n case, but requires multiple allocations for the CONFIG_CPUMASK_OFFSTACK=y case. Hence we add alloc_sched_domains() and free_sched_domains() functions. Signed-off-by: Rusty Russell Cc: Peter Zijlstra LKML-Reference: <200911031453.40668.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index dfc21fb76bf1..78ba664474f3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1009,9 +1009,13 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) return to_cpumask(sd->span); } -extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); +/* Allocate an array of sched domains, for partition_sched_domains(). */ +cpumask_var_t *alloc_sched_domains(unsigned int ndoms); +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); + /* Test a flag in parent sched domain */ static inline int test_sd_parent(struct sched_domain *sd, int flag) { @@ -1029,7 +1033,7 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); struct sched_domain_attr; static inline void -partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { } -- cgit v1.2.2 From d180c5bccec02612256fd8076ff3c1fac3429553 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:48:30 +0900 Subject: sched: Introduce task_times() to replace task_{u,s}time() pair Functions task_{u,s}time() are called in pair in almost all cases. However task_stime() is implemented to call task_utime() from its inside, so such paired calls run task_utime() twice. It means we do heavy divisions (div_u64 + do_div) twice to get utime and stime which can be obtained at same time by one set of divisions. This patch introduces a function task_times(*tsk, *utime, *stime) to retrieve utime and stime at once in better, optimized way. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Americo Wang LKML-Reference: <4B0E16AE.906@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 78ba664474f3..fe6ae1516640 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1723,6 +1723,7 @@ static inline void put_task_struct(struct task_struct *t) extern cputime_t task_utime(struct task_struct *p); extern cputime_t task_stime(struct task_struct *p); extern cputime_t task_gtime(struct task_struct *p); +extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); /* * Per process flags -- cgit v1.2.2 From d5b7c78e975302a1bab28263266c39ecb71acad4 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:49:05 +0900 Subject: sched: Remove task_{u,s,g}time() Now all task_{u,s}time() pairs are replaced by task_times(). And task_gtime() is too simple to be an inline function. Cleanup them all. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Americo Wang LKML-Reference: <4B0E16D1.70902@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index fe6ae1516640..0395b0f4df3a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1720,9 +1720,6 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -extern cputime_t task_utime(struct task_struct *p); -extern cputime_t task_stime(struct task_struct *p); -extern cputime_t task_gtime(struct task_struct *p); extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); /* -- cgit v1.2.2 From b7b20df91d43d5e59578b8fc16e895c0c8cbd9b5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:49:27 +0900 Subject: sched, time: Define nsecs_to_jiffies() Use of msecs_to_jiffies() for nsecs_to_cputime() have some problems: - The type of msecs_to_jiffies()'s argument is unsigned int, so it cannot convert msecs greater than UINT_MAX = about 49.7 days. - msecs_to_jiffies() returns MAX_JIFFY_OFFSET if MSB of argument is set, assuming that input was negative value. So it cannot convert msecs greater than INT_MAX = about 24.8 days too. This patch defines a new function nsecs_to_jiffies() that can deal greater values, and that can deal all incoming values as unsigned. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Amrico Wang Cc: Thomas Gleixner Cc: John Stultz LKML-Reference: <4B0E16E7.5070307@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/jiffies.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 1a9cf78bfce5..6811f4bfc6e7 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x); extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); +extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 -- cgit v1.2.2 From 8592e6486a177a02f048567cb928bc3a1f9a86c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Dec 2009 12:56:46 +0900 Subject: sched: Revert 498657a478c60be092208422fefa9c7b248729c2 498657a478c60be092208422fefa9c7b248729c2 incorrectly assumed that preempt wasn't disabled around context_switch() and thus was fixing imaginary problem. It also broke KVM because it depended on ->sched_in() to be called with irq enabled so that it can do smp calls from there. Revert the incorrect commit and add comment describing different contexts under with the two callbacks are invoked. Avi: spotted transposed in/out in the added comment. Signed-off-by: Tejun Heo Acked-by: Avi Kivity Cc: peterz@infradead.org Cc: efault@gmx.de Cc: rusty@rustcorp.com.au LKML-Reference: <1259726212-30259-2-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 72b1a10a59b6..2e681d9555bd 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -105,6 +105,11 @@ struct preempt_notifier; * @sched_out: we've just been preempted * notifier: struct preempt_notifier for the task being preempted * next: the task that's kicking us out + * + * Please note that sched_in and out are called under different + * contexts. sched_out is called with rq lock held and irq disabled + * while sched_in is called without rq lock and irq enabled. This + * difference is intentional and depended upon by its users. */ struct preempt_ops { void (*sched_in)(struct preempt_notifier *notifier, int cpu); -- cgit v1.2.2 From d99ca3b977fc5a93141304f571475c2af9e6c1c5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 2 Dec 2009 17:26:47 +0900 Subject: sched, cputime: Cleanups related to task_times() - Remove if({u,s}t)s because no one call it with NULL now. - Use cputime_{add,sub}(). - Add ifndef-endif for prev_{u,s}time since they are used only when !VIRT_CPU_ACCOUNTING. Signed-off-by: Hidetoshi Seto Cc: Peter Zijlstra Cc: Spencer Candland Cc: Americo Wang Cc: Oleg Nesterov Cc: Balbir Singh Cc: Stanislaw Gruszka LKML-Reference: <4B1624C7.7040302@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0395b0f4df3a..dff85e58264e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1331,7 +1331,9 @@ struct task_struct { cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING cputime_t prev_utime, prev_stime; +#endif unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* monotonic time */ struct timespec real_start_time; /* boot based time */ -- cgit v1.2.2 From 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 2 Dec 2009 17:28:07 +0900 Subject: sched, cputime: Introduce thread_group_times() This is a real fix for problem of utime/stime values decreasing described in the thread: http://lkml.org/lkml/2009/11/3/522 Now cputime is accounted in the following way: - {u,s}time in task_struct are increased every time when the thread is interrupted by a tick (timer interrupt). - When a thread exits, its {u,s}time are added to signal->{u,s}time, after adjusted by task_times(). - When all threads in a thread_group exits, accumulated {u,s}time (and also c{u,s}time) in signal struct are added to c{u,s}time in signal struct of the group's parent. So {u,s}time in task struct are "raw" tick count, while {u,s}time and c{u,s}time in signal struct are "adjusted" values. And accounted values are used by: - task_times(), to get cputime of a thread: This function returns adjusted values that originates from raw {u,s}time and scaled by sum_exec_runtime that accounted by CFS. - thread_group_cputime(), to get cputime of a thread group: This function returns sum of all {u,s}time of living threads in the group, plus {u,s}time in the signal struct that is sum of adjusted cputimes of all exited threads belonged to the group. The problem is the return value of thread_group_cputime(), because it is mixed sum of "raw" value and "adjusted" value: group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time) This misbehavior can break {u,s}time monotonicity. Assume that if there is a thread that have raw values greater than adjusted values (e.g. interrupted by 1000Hz ticks 50 times but only runs 45ms) and if it exits, cputime will decrease (e.g. -5ms). To fix this, we could do: group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time) But task_times() contains hard divisions, so applying it for every thread should be avoided. This patch fixes the above problem in the following way: - Modify thread's exit (= __exit_signal()) not to use task_times(). It means {u,s}time in signal struct accumulates raw values instead of adjusted values. As the result it makes thread_group_cputime() to return pure sum of "raw" values. - Introduce a new function thread_group_times(*task, *utime, *stime) that converts "raw" values of thread_group_cputime() to "adjusted" values, in same calculation procedure as task_times(). - Modify group's exit (= wait_task_zombie()) to use this introduced thread_group_times(). It make c{u,s}time in signal struct to have adjusted values like before this patch. - Replace some thread_group_cputime() by thread_group_times(). This replacements are only applied where conveys the "adjusted" cputime to users, and where already uses task_times() near by it. (i.e. sys_times(), getrusage(), and /proc//stat.) This patch have a positive side effect: - Before this patch, if a group contains many short-life threads (e.g. runs 0.9ms and not interrupted by ticks), the group's cputime could be invisible since thread's cputime was accumulated after adjusted: imagine adjustment function as adj(ticks, runtime), {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0. After this patch it will not happen because the adjustment is applied after accumulated. v2: - remove if()s, put new variables into signal_struct. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Spencer Candland Cc: Americo Wang Cc: Oleg Nesterov Cc: Balbir Singh Cc: Stanislaw Gruszka LKML-Reference: <4B162517.8040909@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index dff85e58264e..34238bd10ebf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -624,6 +624,9 @@ struct signal_struct { cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + cputime_t prev_utime, prev_stime; +#endif unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; @@ -1723,6 +1726,7 @@ static inline void put_task_struct(struct task_struct *t) } extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); +extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); /* * Per process flags -- cgit v1.2.2