author     Peter Zijlstra <a.p.zijlstra@chello.nl>  2010-12-09 08:15:34 -0500
committer  Ingo Molnar <mingo@elte.hu>              2010-12-16 05:17:47 -0500
commit     8e92c20183ed0579d94501311b81c42b65cb2129 (patch)
tree       3a00ccaf8222c994fd7071129dc357321c10c9df /kernel/sched.c
parent     fe44d62122829959e960bc699318d58966922a69 (diff)
sched: Fix the irqtime code for 32bit
Since the irqtime accounting uses a non-atomic u64 and can be read from remote cpus (writes are strictly cpu local, reads are not), we have to deal with observing partial updates. When we do observe a partial update, the clock movement (in particular, ->clock_task movement) will go funny (in either direction); a subsequent clock update (observing the full update) will make it go funny in the opposite direction.

Since we rely on these clocks to be strictly monotonic we cannot suffer backwards motion.

One possible solution would be to simply ignore all backwards deltas, but that will lead to accounting artefacts, most notably: clock_task + irq_time != clock. This inaccuracy would end up in user-visible stats.

Therefore serialize the reads using a seqcount.

Reviewed-by: Venkatesh Pallipadi <venki@google.com>
Reported-by: Mikael Pettersson <mikpe@it.uu.se>
Tested-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292242434.6803.200.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
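On 32-bit, a remote reader can tear the u64: if, say, cpu_hardirq_time moves from 0x00000000ffffffff to 0x0000000100000000, a reader that picks up the new low word together with the old high word observes 0, a large backwards step that a later (complete) read then reverses. Below is a minimal user-space model of the seqcount scheme the patch adds; it is a sketch of the technique, not the kernel code. The helper names mirror the patch, while the plain globals, the pthread writer and the C11 fences standing in for smp_wmb()/smp_rmb() are illustrative assumptions.

/* seqcount_demo.c - user-space model of the per-cpu seqcount added here.
 * Like the kernel code, the data itself is accessed with plain loads and
 * stores; the sequence counter only makes torn or in-flight values
 * detectable so the reader can retry.
 *
 * Build: gcc -O2 -pthread seqcount_demo.c -o seqcount_demo
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t cpu_hardirq_time, cpu_softirq_time; /* written by one thread only */
static atomic_uint irq_time_seq;                    /* even: stable, odd: write in progress */

static void irq_time_write_begin(void)
{
	atomic_fetch_add_explicit(&irq_time_seq, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);      /* stands in for smp_wmb() */
}

static void irq_time_write_end(void)
{
	atomic_thread_fence(memory_order_release);      /* stands in for smp_wmb() */
	atomic_fetch_add_explicit(&irq_time_seq, 1, memory_order_relaxed);
}

static uint64_t irq_time_read(void)
{
	uint64_t irq_time;
	unsigned int seq;

	do {
		seq = atomic_load_explicit(&irq_time_seq, memory_order_acquire);
		irq_time = cpu_softirq_time + cpu_hardirq_time;
		atomic_thread_fence(memory_order_acquire); /* stands in for smp_rmb() */
		/* retry if a write was in flight (odd) or completed underneath us */
	} while ((seq & 1) ||
		 seq != atomic_load_explicit(&irq_time_seq, memory_order_relaxed));

	return irq_time;
}

static void *irq_writer(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000000; i++) {
		irq_time_write_begin();
		cpu_hardirq_time += 3;
		cpu_softirq_time += 2;
		irq_time_write_end();
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;
	uint64_t prev = 0;

	pthread_create(&tid, NULL, irq_writer, NULL);
	for (int i = 0; i < 1000000; i++) {
		uint64_t now = irq_time_read();
		if (now < prev)         /* must never trigger with the seqcount */
			printf("backwards motion: %llu -> %llu\n",
			       (unsigned long long)prev, (unsigned long long)now);
		prev = now;
	}
	pthread_join(tid, NULL);
	printf("final irq_time = %llu\n", (unsigned long long)irq_time_read());
	return 0;
}

The writer makes the sequence odd before touching the counters and even again afterwards; a reader that overlaps a write sees either an odd or a changed sequence and retries, so every returned sum is a consistent snapshot and the clocks derived from it only move forward.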
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  51
1 file changed, 45 insertions(+), 6 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 79b557c63381..456c99054160 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1920,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  * They are read and saved off onto struct rq in update_rq_clock().
  * This may result in other CPU reading this CPU's irq time and can
  * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
  */
 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
 static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1941,10 +1940,48 @@ void disable_sched_clock_irqtime(void)
 	sched_clock_irqtime = 0;
 }
 
-static inline u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+	__this_cpu_inc(irq_time_seq.sequence);
+	smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+	smp_wmb();
+	__this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	u64 irq_time;
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+		irq_time = per_cpu(cpu_softirq_time, cpu) +
+			   per_cpu(cpu_hardirq_time, cpu);
+	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+	return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
 {
 	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
 }
+#endif /* CONFIG_64BIT */
 
 /*
  * Called before incrementing preempt_count on {soft,}irq_enter
@@ -1965,6 +2002,7 @@ void account_system_vtime(struct task_struct *curr)
 	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
 	__this_cpu_add(irq_start_time, delta);
 
+	irq_time_write_begin();
 	/*
 	 * We do not account for softirq time from ksoftirqd here.
 	 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1976,6 +2014,7 @@ void account_system_vtime(struct task_struct *curr)
 	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
 		__this_cpu_add(cpu_softirq_time, delta);
 
+	irq_time_write_end();
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
@@ -1984,7 +2023,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	s64 irq_delta;
 
-	irq_delta = irq_time_cpu(cpu_of(rq)) - rq->prev_irq_time;
+	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
 	/*
 	 * Since irq_time is only updated on {soft,}irq_exit, we might run into