diff options
author | Ingo Molnar <mingo@elte.hu> | 2010-12-22 05:53:20 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2010-12-22 05:53:23 -0500 |
commit | 6c529a266bdc590a870ee2d2092ff6527eff427b (patch) | |
tree | 7be65fa2578820a1258b5a1e8e063a509a5d6176 /kernel | |
parent | 7639dae0ca11038286bbbcda05f2bef601c1eb8d (diff) | |
parent | 90a8a73c06cc32b609a880d48449d7083327e11a (diff) |
Merge commit 'v2.6.37-rc7' into perf/core
Merge reason: Pick up the latest -rc.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/fork.c | 1 | ||||
-rw-r--r-- | kernel/power/swap.c | 2 | ||||
-rw-r--r-- | kernel/power/user.c | 2 | ||||
-rw-r--r-- | kernel/resource.c | 104 | ||||
-rw-r--r-- | kernel/sched.c | 287 | ||||
-rw-r--r-- | kernel/timer.c | 8 | ||||
-rw-r--r-- | kernel/workqueue.c | 7 |
7 files changed, 260 insertions, 151 deletions
diff --git a/kernel/fork.c b/kernel/fork.c index 3b159c5991b7..5447dc7defa9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -273,6 +273,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
273 | 273 | ||
274 | setup_thread_stack(tsk, orig); | 274 | setup_thread_stack(tsk, orig); |
275 | clear_user_return_notifier(tsk); | 275 | clear_user_return_notifier(tsk); |
276 | clear_tsk_need_resched(tsk); | ||
276 | stackend = end_of_stack(tsk); | 277 | stackend = end_of_stack(tsk); |
277 | *stackend = STACK_END_MAGIC; /* for overflow detection */ | 278 | *stackend = STACK_END_MAGIC; /* for overflow detection */ |
278 | 279 | ||
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index baf667bb2794..8c7e4832b9be 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -30,7 +30,7 @@ | |||
30 | 30 | ||
31 | #include "power.h" | 31 | #include "power.h" |
32 | 32 | ||
33 | #define HIBERNATE_SIG "LINHIB0001" | 33 | #define HIBERNATE_SIG "S1SUSPEND" |
34 | 34 | ||
35 | /* | 35 | /* |
36 | * The swap map is a data structure used for keeping track of each page | 36 | * The swap map is a data structure used for keeping track of each page |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 1b2ea31e6bd8..c36c3b9e8a84 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
137 | free_all_swap_pages(data->swap); | 137 | free_all_swap_pages(data->swap); |
138 | if (data->frozen) | 138 | if (data->frozen) |
139 | thaw_processes(); | 139 | thaw_processes(); |
140 | pm_notifier_call_chain(data->mode == O_WRONLY ? | 140 | pm_notifier_call_chain(data->mode == O_RDONLY ? |
141 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 141 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
142 | atomic_inc(&snapshot_device_available); | 142 | atomic_inc(&snapshot_device_available); |
143 | 143 | ||
diff --git a/kernel/resource.c b/kernel/resource.c index 9fad33efd0db..798e2fae2a06 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource); | |||
40 | 40 | ||
41 | static DEFINE_RWLOCK(resource_lock); | 41 | static DEFINE_RWLOCK(resource_lock); |
42 | 42 | ||
43 | /* | ||
44 | * By default, we allocate free space bottom-up. The architecture can request | ||
45 | * top-down by clearing this flag. The user can override the architecture's | ||
46 | * choice with the "resource_alloc_from_bottom" kernel boot option, but that | ||
47 | * should only be a debugging tool. | ||
48 | */ | ||
49 | int resource_alloc_from_bottom = 1; | ||
50 | |||
51 | static __init int setup_alloc_from_bottom(char *s) | ||
52 | { | ||
53 | printk(KERN_INFO | ||
54 | "resource: allocating from bottom-up; please report a bug\n"); | ||
55 | resource_alloc_from_bottom = 1; | ||
56 | return 0; | ||
57 | } | ||
58 | early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); | ||
59 | |||
60 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 43 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
61 | { | 44 | { |
62 | struct resource *p = v; | 45 | struct resource *p = v; |
@@ -374,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn) | |||
374 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; | 357 | return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; |
375 | } | 358 | } |
376 | 359 | ||
360 | void __weak arch_remove_reservations(struct resource *avail) | ||
361 | { | ||
362 | } | ||
363 | |||
377 | static resource_size_t simple_align_resource(void *data, | 364 | static resource_size_t simple_align_resource(void *data, |
378 | const struct resource *avail, | 365 | const struct resource *avail, |
379 | resource_size_t size, | 366 | resource_size_t size, |
@@ -397,74 +384,7 @@ static bool resource_contains(struct resource *res1, struct resource *res2) | |||
397 | } | 384 | } |
398 | 385 | ||
399 | /* | 386 | /* |
400 | * Find the resource before "child" in the sibling list of "root" children. | ||
401 | */ | ||
402 | static struct resource *find_sibling_prev(struct resource *root, struct resource *child) | ||
403 | { | ||
404 | struct resource *this; | ||
405 | |||
406 | for (this = root->child; this; this = this->sibling) | ||
407 | if (this->sibling == child) | ||
408 | return this; | ||
409 | |||
410 | return NULL; | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * Find empty slot in the resource tree given range and alignment. | 387 | * Find empty slot in the resource tree given range and alignment. |
415 | * This version allocates from the end of the root resource first. | ||
416 | */ | ||
417 | static int find_resource_from_top(struct resource *root, struct resource *new, | ||
418 | resource_size_t size, resource_size_t min, | ||
419 | resource_size_t max, resource_size_t align, | ||
420 | resource_size_t (*alignf)(void *, | ||
421 | const struct resource *, | ||
422 | resource_size_t, | ||
423 | resource_size_t), | ||
424 | void *alignf_data) | ||
425 | { | ||
426 | struct resource *this; | ||
427 | struct resource tmp, avail, alloc; | ||
428 | |||
429 | tmp.start = root->end; | ||
430 | tmp.end = root->end; | ||
431 | |||
432 | this = find_sibling_prev(root, NULL); | ||
433 | for (;;) { | ||
434 | if (this) { | ||
435 | if (this->end < root->end) | ||
436 | tmp.start = this->end + 1; | ||
437 | } else | ||
438 | tmp.start = root->start; | ||
439 | |||
440 | resource_clip(&tmp, min, max); | ||
441 | |||
442 | /* Check for overflow after ALIGN() */ | ||
443 | avail = *new; | ||
444 | avail.start = ALIGN(tmp.start, align); | ||
445 | avail.end = tmp.end; | ||
446 | if (avail.start >= tmp.start) { | ||
447 | alloc.start = alignf(alignf_data, &avail, size, align); | ||
448 | alloc.end = alloc.start + size - 1; | ||
449 | if (resource_contains(&avail, &alloc)) { | ||
450 | new->start = alloc.start; | ||
451 | new->end = alloc.end; | ||
452 | return 0; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | if (!this || this->start == root->start) | ||
457 | break; | ||
458 | |||
459 | tmp.end = this->start - 1; | ||
460 | this = find_sibling_prev(root, this); | ||
461 | } | ||
462 | return -EBUSY; | ||
463 | } | ||
464 | |||
465 | /* | ||
466 | * Find empty slot in the resource tree given range and alignment. | ||
467 | * This version allocates from the beginning of the root resource first. | ||
468 | */ | 388 | */ |
469 | static int find_resource(struct resource *root, struct resource *new, | 389 | static int find_resource(struct resource *root, struct resource *new, |
470 | resource_size_t size, resource_size_t min, | 390 | resource_size_t size, resource_size_t min, |
@@ -478,23 +398,24 @@ static int find_resource(struct resource *root, struct resource *new, | |||
478 | struct resource *this = root->child; | 398 | struct resource *this = root->child; |
479 | struct resource tmp = *new, avail, alloc; | 399 | struct resource tmp = *new, avail, alloc; |
480 | 400 | ||
401 | tmp.flags = new->flags; | ||
481 | tmp.start = root->start; | 402 | tmp.start = root->start; |
482 | /* | 403 | /* |
483 | * Skip past an allocated resource that starts at 0, since the | 404 | * Skip past an allocated resource that starts at 0, since the assignment |
484 | * assignment of this->start - 1 to tmp->end below would cause an | 405 | * of this->start - 1 to tmp->end below would cause an underflow. |
485 | * underflow. | ||
486 | */ | 406 | */ |
487 | if (this && this->start == 0) { | 407 | if (this && this->start == 0) { |
488 | tmp.start = this->end + 1; | 408 | tmp.start = this->end + 1; |
489 | this = this->sibling; | 409 | this = this->sibling; |
490 | } | 410 | } |
491 | for (;;) { | 411 | for(;;) { |
492 | if (this) | 412 | if (this) |
493 | tmp.end = this->start - 1; | 413 | tmp.end = this->start - 1; |
494 | else | 414 | else |
495 | tmp.end = root->end; | 415 | tmp.end = root->end; |
496 | 416 | ||
497 | resource_clip(&tmp, min, max); | 417 | resource_clip(&tmp, min, max); |
418 | arch_remove_reservations(&tmp); | ||
498 | 419 | ||
499 | /* Check for overflow after ALIGN() */ | 420 | /* Check for overflow after ALIGN() */ |
500 | avail = *new; | 421 | avail = *new; |
@@ -509,10 +430,8 @@ static int find_resource(struct resource *root, struct resource *new, | |||
509 | return 0; | 430 | return 0; |
510 | } | 431 | } |
511 | } | 432 | } |
512 | |||
513 | if (!this) | 433 | if (!this) |
514 | break; | 434 | break; |
515 | |||
516 | tmp.start = this->end + 1; | 435 | tmp.start = this->end + 1; |
517 | this = this->sibling; | 436 | this = this->sibling; |
518 | } | 437 | } |
@@ -545,10 +464,7 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
545 | alignf = simple_align_resource; | 464 | alignf = simple_align_resource; |
546 | 465 | ||
547 | write_lock(&resource_lock); | 466 | write_lock(&resource_lock); |
548 | if (resource_alloc_from_bottom) | 467 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); |
549 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | ||
550 | else | ||
551 | err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data); | ||
552 | if (err >= 0 && __request_resource(root, new)) | 468 | if (err >= 0 && __request_resource(root, new)) |
553 | err = -EBUSY; | 469 | err = -EBUSY; |
554 | write_unlock(&resource_lock); | 470 | write_unlock(&resource_lock); |
diff --git a/kernel/sched.c b/kernel/sched.c index 605ab1b24d81..c68cead94dd7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -636,22 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
636 | 636 | ||
637 | #endif /* CONFIG_CGROUP_SCHED */ | 637 | #endif /* CONFIG_CGROUP_SCHED */ |
638 | 638 | ||
639 | static u64 irq_time_cpu(int cpu); | 639 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
640 | static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); | ||
641 | 640 | ||
642 | inline void update_rq_clock(struct rq *rq) | 641 | static void update_rq_clock(struct rq *rq) |
643 | { | 642 | { |
644 | if (!rq->skip_clock_update) { | 643 | s64 delta; |
645 | int cpu = cpu_of(rq); | ||
646 | u64 irq_time; | ||
647 | 644 | ||
648 | rq->clock = sched_clock_cpu(cpu); | 645 | if (rq->skip_clock_update) |
649 | irq_time = irq_time_cpu(cpu); | 646 | return; |
650 | if (rq->clock - irq_time > rq->clock_task) | ||
651 | rq->clock_task = rq->clock - irq_time; | ||
652 | 647 | ||
653 | sched_irq_time_avg_update(rq, irq_time); | 648 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
654 | } | 649 | rq->clock += delta; |
650 | update_rq_clock_task(rq, delta); | ||
655 | } | 651 | } |
656 | 652 | ||
657 | /* | 653 | /* |
@@ -1924,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1924 | * They are read and saved off onto struct rq in update_rq_clock(). | 1920 | * They are read and saved off onto struct rq in update_rq_clock(). |
1925 | * This may result in other CPU reading this CPU's irq time and can | 1921 | * This may result in other CPU reading this CPU's irq time and can |
1926 | * race with irq/account_system_vtime on this CPU. We would either get old | 1922 | * race with irq/account_system_vtime on this CPU. We would either get old |
1927 | * or new value (or semi updated value on 32 bit) with a side effect of | 1923 | * or new value with a side effect of accounting a slice of irq time to wrong |
1928 | * accounting a slice of irq time to wrong task when irq is in progress | 1924 | * task when irq is in progress while we read rq->clock. That is a worthy |
1929 | * while we read rq->clock. That is a worthy compromise in place of having | 1925 | * compromise in place of having locks on each irq in account_system_time. |
1930 | * locks on each irq in account_system_time. | ||
1931 | */ | 1926 | */ |
1932 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | 1927 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); |
1933 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | 1928 | static DEFINE_PER_CPU(u64, cpu_softirq_time); |
@@ -1945,19 +1940,58 @@ void disable_sched_clock_irqtime(void) | |||
1945 | sched_clock_irqtime = 0; | 1940 | sched_clock_irqtime = 0; |
1946 | } | 1941 | } |
1947 | 1942 | ||
1948 | static u64 irq_time_cpu(int cpu) | 1943 | #ifndef CONFIG_64BIT |
1944 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
1945 | |||
1946 | static inline void irq_time_write_begin(void) | ||
1949 | { | 1947 | { |
1950 | if (!sched_clock_irqtime) | 1948 | __this_cpu_inc(irq_time_seq.sequence); |
1951 | return 0; | 1949 | smp_wmb(); |
1950 | } | ||
1951 | |||
1952 | static inline void irq_time_write_end(void) | ||
1953 | { | ||
1954 | smp_wmb(); | ||
1955 | __this_cpu_inc(irq_time_seq.sequence); | ||
1956 | } | ||
1957 | |||
1958 | static inline u64 irq_time_read(int cpu) | ||
1959 | { | ||
1960 | u64 irq_time; | ||
1961 | unsigned seq; | ||
1952 | 1962 | ||
1963 | do { | ||
1964 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
1965 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
1966 | per_cpu(cpu_hardirq_time, cpu); | ||
1967 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1968 | |||
1969 | return irq_time; | ||
1970 | } | ||
1971 | #else /* CONFIG_64BIT */ | ||
1972 | static inline void irq_time_write_begin(void) | ||
1973 | { | ||
1974 | } | ||
1975 | |||
1976 | static inline void irq_time_write_end(void) | ||
1977 | { | ||
1978 | } | ||
1979 | |||
1980 | static inline u64 irq_time_read(int cpu) | ||
1981 | { | ||
1953 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | 1982 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); |
1954 | } | 1983 | } |
1984 | #endif /* CONFIG_64BIT */ | ||
1955 | 1985 | ||
1986 | /* | ||
1987 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
1988 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
1989 | */ | ||
1956 | void account_system_vtime(struct task_struct *curr) | 1990 | void account_system_vtime(struct task_struct *curr) |
1957 | { | 1991 | { |
1958 | unsigned long flags; | 1992 | unsigned long flags; |
1993 | s64 delta; | ||
1959 | int cpu; | 1994 | int cpu; |
1960 | u64 now, delta; | ||
1961 | 1995 | ||
1962 | if (!sched_clock_irqtime) | 1996 | if (!sched_clock_irqtime) |
1963 | return; | 1997 | return; |
@@ -1965,9 +1999,10 @@ void account_system_vtime(struct task_struct *curr) | |||
1965 | local_irq_save(flags); | 1999 | local_irq_save(flags); |
1966 | 2000 | ||
1967 | cpu = smp_processor_id(); | 2001 | cpu = smp_processor_id(); |
1968 | now = sched_clock_cpu(cpu); | 2002 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); |
1969 | delta = now - per_cpu(irq_start_time, cpu); | 2003 | __this_cpu_add(irq_start_time, delta); |
1970 | per_cpu(irq_start_time, cpu) = now; | 2004 | |
2005 | irq_time_write_begin(); | ||
1971 | /* | 2006 | /* |
1972 | * We do not account for softirq time from ksoftirqd here. | 2007 | * We do not account for softirq time from ksoftirqd here. |
1973 | * We want to continue accounting softirq time to ksoftirqd thread | 2008 | * We want to continue accounting softirq time to ksoftirqd thread |
@@ -1975,33 +2010,55 @@ void account_system_vtime(struct task_struct *curr) | |||
1975 | * that do not consume any time, but still wants to run. | 2010 | * that do not consume any time, but still wants to run. |
1976 | */ | 2011 | */ |
1977 | if (hardirq_count()) | 2012 | if (hardirq_count()) |
1978 | per_cpu(cpu_hardirq_time, cpu) += delta; | 2013 | __this_cpu_add(cpu_hardirq_time, delta); |
1979 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | 2014 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) |
1980 | per_cpu(cpu_softirq_time, cpu) += delta; | 2015 | __this_cpu_add(cpu_softirq_time, delta); |
1981 | 2016 | ||
2017 | irq_time_write_end(); | ||
1982 | local_irq_restore(flags); | 2018 | local_irq_restore(flags); |
1983 | } | 2019 | } |
1984 | EXPORT_SYMBOL_GPL(account_system_vtime); | 2020 | EXPORT_SYMBOL_GPL(account_system_vtime); |
1985 | 2021 | ||
1986 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) | 2022 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1987 | { | 2023 | { |
1988 | if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { | 2024 | s64 irq_delta; |
1989 | u64 delta_irq = curr_irq_time - rq->prev_irq_time; | 2025 | |
1990 | rq->prev_irq_time = curr_irq_time; | 2026 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; |
1991 | sched_rt_avg_update(rq, delta_irq); | 2027 | |
1992 | } | 2028 | /* |
2029 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
2030 | * this case when a previous update_rq_clock() happened inside a | ||
2031 | * {soft,}irq region. | ||
2032 | * | ||
2033 | * When this happens, we stop ->clock_task and only update the | ||
2034 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
2035 | * update will consume the rest. This ensures ->clock_task is | ||
2036 | * monotonic. | ||
2037 | * | ||
2038 | * It does however cause some slight misattribution of {soft,}irq | ||
2039 | * time, a more accurate solution would be to update the irq_time using | ||
2040 | * the current rq->clock timestamp, except that would require using | ||
2041 | * atomic ops. | ||
2042 | */ | ||
2043 | if (irq_delta > delta) | ||
2044 | irq_delta = delta; | ||
2045 | |||
2046 | rq->prev_irq_time += irq_delta; | ||
2047 | delta -= irq_delta; | ||
2048 | rq->clock_task += delta; | ||
2049 | |||
2050 | if (irq_delta && sched_feat(NONIRQ_POWER)) | ||
2051 | sched_rt_avg_update(rq, irq_delta); | ||
1993 | } | 2052 | } |
1994 | 2053 | ||
1995 | #else | 2054 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1996 | 2055 | ||
1997 | static u64 irq_time_cpu(int cpu) | 2056 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1998 | { | 2057 | { |
1999 | return 0; | 2058 | rq->clock_task += delta; |
2000 | } | 2059 | } |
2001 | 2060 | ||
2002 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } | 2061 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
2003 | |||
2004 | #endif | ||
2005 | 2062 | ||
2006 | #include "sched_idletask.c" | 2063 | #include "sched_idletask.c" |
2007 | #include "sched_fair.c" | 2064 | #include "sched_fair.c" |
@@ -2129,7 +2186,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
2129 | * A queue event has occurred, and we're going to schedule. In | 2186 | * A queue event has occurred, and we're going to schedule. In |
2130 | * this case, we can save a useless back to back clock update. | 2187 | * this case, we can save a useless back to back clock update. |
2131 | */ | 2188 | */ |
2132 | if (test_tsk_need_resched(rq->curr)) | 2189 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) |
2133 | rq->skip_clock_update = 1; | 2190 | rq->skip_clock_update = 1; |
2134 | } | 2191 | } |
2135 | 2192 | ||
@@ -3119,6 +3176,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
3119 | return delta; | 3176 | return delta; |
3120 | } | 3177 | } |
3121 | 3178 | ||
3179 | static unsigned long | ||
3180 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3181 | { | ||
3182 | load *= exp; | ||
3183 | load += active * (FIXED_1 - exp); | ||
3184 | load += 1UL << (FSHIFT - 1); | ||
3185 | return load >> FSHIFT; | ||
3186 | } | ||
3187 | |||
3122 | #ifdef CONFIG_NO_HZ | 3188 | #ifdef CONFIG_NO_HZ |
3123 | /* | 3189 | /* |
3124 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3190 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
@@ -3148,6 +3214,128 @@ static long calc_load_fold_idle(void) | |||
3148 | 3214 | ||
3149 | return delta; | 3215 | return delta; |
3150 | } | 3216 | } |
3217 | |||
3218 | /** | ||
3219 | * fixed_power_int - compute: x^n, in O(log n) time | ||
3220 | * | ||
3221 | * @x: base of the power | ||
3222 | * @frac_bits: fractional bits of @x | ||
3223 | * @n: power to raise @x to. | ||
3224 | * | ||
3225 | * By exploiting the relation between the definition of the natural power | ||
3226 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
3227 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
3228 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
3229 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
3230 | * of course trivially computable in O(log_2 n), the length of our binary | ||
3231 | * vector. | ||
3232 | */ | ||
3233 | static unsigned long | ||
3234 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
3235 | { | ||
3236 | unsigned long result = 1UL << frac_bits; | ||
3237 | |||
3238 | if (n) for (;;) { | ||
3239 | if (n & 1) { | ||
3240 | result *= x; | ||
3241 | result += 1UL << (frac_bits - 1); | ||
3242 | result >>= frac_bits; | ||
3243 | } | ||
3244 | n >>= 1; | ||
3245 | if (!n) | ||
3246 | break; | ||
3247 | x *= x; | ||
3248 | x += 1UL << (frac_bits - 1); | ||
3249 | x >>= frac_bits; | ||
3250 | } | ||
3251 | |||
3252 | return result; | ||
3253 | } | ||
3254 | |||
3255 | /* | ||
3256 | * a1 = a0 * e + a * (1 - e) | ||
3257 | * | ||
3258 | * a2 = a1 * e + a * (1 - e) | ||
3259 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
3260 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
3261 | * | ||
3262 | * a3 = a2 * e + a * (1 - e) | ||
3263 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
3264 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
3265 | * | ||
3266 | * ... | ||
3267 | * | ||
3268 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1] | ||
3269 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
3270 | * = a0 * e^n + a * (1 - e^n) | ||
3271 | * | ||
3272 | * [1] application of the geometric series: | ||
3273 | * | ||
3274 | *              n          1 - x^(n+1) | ||
3275 | *     S_n := \Sum x^i = ------------- | ||
3276 | *             i=0            1 - x | ||
3277 | */ | ||
3278 | static unsigned long | ||
3279 | calc_load_n(unsigned long load, unsigned long exp, | ||
3280 | unsigned long active, unsigned int n) | ||
3281 | { | ||
3282 | |||
3283 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
3284 | } | ||
3285 | |||
3286 | /* | ||
3287 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
3288 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
3289 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
3290 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
3291 | * | ||
3292 | * Once we've updated the global active value, we need to apply the exponential | ||
3293 | * weights adjusted to the number of cycles missed. | ||
3294 | */ | ||
3295 | static void calc_global_nohz(unsigned long ticks) | ||
3296 | { | ||
3297 | long delta, active, n; | ||
3298 | |||
3299 | if (time_before(jiffies, calc_load_update)) | ||
3300 | return; | ||
3301 | |||
3302 | /* | ||
3303 | * If we crossed a calc_load_update boundary, make sure to fold | ||
3304 | * any pending idle changes, the respective CPUs might have | ||
3305 | * missed the tick driven calc_load_account_active() update | ||
3306 | * due to NO_HZ. | ||
3307 | */ | ||
3308 | delta = calc_load_fold_idle(); | ||
3309 | if (delta) | ||
3310 | atomic_long_add(delta, &calc_load_tasks); | ||
3311 | |||
3312 | /* | ||
3313 | * If we were idle for multiple load cycles, apply them. | ||
3314 | */ | ||
3315 | if (ticks >= LOAD_FREQ) { | ||
3316 | n = ticks / LOAD_FREQ; | ||
3317 | |||
3318 | active = atomic_long_read(&calc_load_tasks); | ||
3319 | active = active > 0 ? active * FIXED_1 : 0; | ||
3320 | |||
3321 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
3322 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
3323 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
3324 | |||
3325 | calc_load_update += n * LOAD_FREQ; | ||
3326 | } | ||
3327 | |||
3328 | /* | ||
3329 | * It's possible the remainder of the above division also crosses | ||
3330 | * a LOAD_FREQ period, the regular check in calc_global_load() | ||
3331 | * which comes after this will take care of that. | ||
3332 | * | ||
3333 | * Consider us being 11 ticks before a cycle completion, and us | ||
3334 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
3335 | * age us 4 cycles, and the test in calc_global_load() will | ||
3336 | * pick up the final one. | ||
3337 | */ | ||
3338 | } | ||
3151 | #else | 3339 | #else |
3152 | static void calc_load_account_idle(struct rq *this_rq) | 3340 | static void calc_load_account_idle(struct rq *this_rq) |
3153 | { | 3341 | { |
@@ -3157,6 +3345,10 @@ static inline long calc_load_fold_idle(void) | |||
3157 | { | 3345 | { |
3158 | return 0; | 3346 | return 0; |
3159 | } | 3347 | } |
3348 | |||
3349 | static void calc_global_nohz(unsigned long ticks) | ||
3350 | { | ||
3351 | } | ||
3160 | #endif | 3352 | #endif |
3161 | 3353 | ||
3162 | /** | 3354 | /** |
@@ -3174,24 +3366,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
3174 | loads[2] = (avenrun[2] + offset) << shift; | 3366 | loads[2] = (avenrun[2] + offset) << shift; |
3175 | } | 3367 | } |
3176 | 3368 | ||
3177 | static unsigned long | ||
3178 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3179 | { | ||
3180 | load *= exp; | ||
3181 | load += active * (FIXED_1 - exp); | ||
3182 | return load >> FSHIFT; | ||
3183 | } | ||
3184 | |||
3185 | /* | 3369 | /* |
3186 | * calc_global_load - update the avenrun load estimates 10 ticks after the | 3370 | * calc_global_load - update the avenrun load estimates 10 ticks after the |
3187 | * CPUs have updated calc_load_tasks. | 3371 | * CPUs have updated calc_load_tasks. |
3188 | */ | 3372 | */ |
3189 | void calc_global_load(void) | 3373 | void calc_global_load(unsigned long ticks) |
3190 | { | 3374 | { |
3191 | unsigned long upd = calc_load_update + 10; | ||
3192 | long active; | 3375 | long active; |
3193 | 3376 | ||
3194 | if (time_before(jiffies, upd)) | 3377 | calc_global_nohz(ticks); |
3378 | |||
3379 | if (time_before(jiffies, calc_load_update + 10)) | ||
3195 | return; | 3380 | return; |
3196 | 3381 | ||
3197 | active = atomic_long_read(&calc_load_tasks); | 3382 | active = atomic_long_read(&calc_load_tasks); |
@@ -3845,7 +4030,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
3845 | { | 4030 | { |
3846 | if (prev->se.on_rq) | 4031 | if (prev->se.on_rq) |
3847 | update_rq_clock(rq); | 4032 | update_rq_clock(rq); |
3848 | rq->skip_clock_update = 0; | ||
3849 | prev->sched_class->put_prev_task(rq, prev); | 4033 | prev->sched_class->put_prev_task(rq, prev); |
3850 | } | 4034 | } |
3851 | 4035 | ||
@@ -3903,7 +4087,6 @@ need_resched_nonpreemptible: | |||
3903 | hrtick_clear(rq); | 4087 | hrtick_clear(rq); |
3904 | 4088 | ||
3905 | raw_spin_lock_irq(&rq->lock); | 4089 | raw_spin_lock_irq(&rq->lock); |
3906 | clear_tsk_need_resched(prev); | ||
3907 | 4090 | ||
3908 | switch_count = &prev->nivcsw; | 4091 | switch_count = &prev->nivcsw; |
3909 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 4092 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
@@ -3935,6 +4118,8 @@ need_resched_nonpreemptible: | |||
3935 | 4118 | ||
3936 | put_prev_task(rq, prev); | 4119 | put_prev_task(rq, prev); |
3937 | next = pick_next_task(rq); | 4120 | next = pick_next_task(rq); |
4121 | clear_tsk_need_resched(prev); | ||
4122 | rq->skip_clock_update = 0; | ||
3938 | 4123 | ||
3939 | if (likely(prev != next)) { | 4124 | if (likely(prev != next)) { |
3940 | sched_info_switch(prev, next); | 4125 | sched_info_switch(prev, next); |
diff --git a/kernel/timer.c b/kernel/timer.c index 68a9ae7679b7..353b9227c2ec 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -1252,6 +1252,12 @@ unsigned long get_next_timer_interrupt(unsigned long now) | |||
1252 | struct tvec_base *base = __get_cpu_var(tvec_bases); | 1252 | struct tvec_base *base = __get_cpu_var(tvec_bases); |
1253 | unsigned long expires; | 1253 | unsigned long expires; |
1254 | 1254 | ||
1255 | /* | ||
1256 | * Pretend that there is no timer pending if the cpu is offline. | ||
1257 | * Possible pending timers will be migrated later to an active cpu. | ||
1258 | */ | ||
1259 | if (cpu_is_offline(smp_processor_id())) | ||
1260 | return now + NEXT_TIMER_MAX_DELTA; | ||
1255 | spin_lock(&base->lock); | 1261 | spin_lock(&base->lock); |
1256 | if (time_before_eq(base->next_timer, base->timer_jiffies)) | 1262 | if (time_before_eq(base->next_timer, base->timer_jiffies)) |
1257 | base->next_timer = __next_timer_interrupt(base); | 1263 | base->next_timer = __next_timer_interrupt(base); |
@@ -1319,7 +1325,7 @@ void do_timer(unsigned long ticks) | |||
1319 | { | 1325 | { |
1320 | jiffies_64 += ticks; | 1326 | jiffies_64 += ticks; |
1321 | update_wall_time(); | 1327 | update_wall_time(); |
1322 | calc_global_load(); | 1328 | calc_global_load(ticks); |
1323 | } | 1329 | } |
1324 | 1330 | ||
1325 | #ifdef __ARCH_WANT_SYS_ALARM | 1331 | #ifdef __ARCH_WANT_SYS_ALARM |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 90db1bd1a978..e785b0f2aea5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -661,7 +661,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | |||
661 | { | 661 | { |
662 | struct worker *worker = kthread_data(task); | 662 | struct worker *worker = kthread_data(task); |
663 | 663 | ||
664 | if (likely(!(worker->flags & WORKER_NOT_RUNNING))) | 664 | if (!(worker->flags & WORKER_NOT_RUNNING)) |
665 | atomic_inc(get_gcwq_nr_running(cpu)); | 665 | atomic_inc(get_gcwq_nr_running(cpu)); |
666 | } | 666 | } |
667 | 667 | ||
@@ -687,7 +687,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, | |||
687 | struct global_cwq *gcwq = get_gcwq(cpu); | 687 | struct global_cwq *gcwq = get_gcwq(cpu); |
688 | atomic_t *nr_running = get_gcwq_nr_running(cpu); | 688 | atomic_t *nr_running = get_gcwq_nr_running(cpu); |
689 | 689 | ||
690 | if (unlikely(worker->flags & WORKER_NOT_RUNNING)) | 690 | if (worker->flags & WORKER_NOT_RUNNING) |
691 | return NULL; | 691 | return NULL; |
692 | 692 | ||
693 | /* this can only happen on the local cpu */ | 693 | /* this can only happen on the local cpu */ |
@@ -3692,7 +3692,8 @@ static int __init init_workqueues(void) | |||
3692 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); | 3692 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); |
3693 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, | 3693 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, |
3694 | WQ_UNBOUND_MAX_ACTIVE); | 3694 | WQ_UNBOUND_MAX_ACTIVE); |
3695 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); | 3695 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || |
3696 | !system_unbound_wq); | ||
3696 | return 0; | 3697 | return 0; |
3697 | } | 3698 | } |
3698 | early_initcall(init_workqueues); | 3699 | early_initcall(init_workqueues); |