author     Linus Torvalds <torvalds@linux-foundation.org>   2016-03-14 22:14:06 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-03-14 22:14:06 -0400
commit     d4e796152a049f6a675f8b6dcf7080a9d80014e5
tree       3d7bff1541b4035f7fd06c7259032e616ea6b497
parent     d88bfe1d68735595d57bd071294f664c4f054435
parent     f9c904b7613b8b4c85b10cd6b33ad41b2843fa9d
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main changes in this cycle are:
- Make schedstats a runtime tunable (disabled by default) and
optimize it via static keys.
As most distributions enable CONFIG_SCHEDSTATS=y due to its
instrumentation value, this is a nice performance enhancement.
(Mel Gorman)
- Implement 'simple waitqueues' (swait): these are just pure
waitqueues without any of the more complex features of full-blown
waitqueues (callbacks, wake flags, wake keys, etc.). Simple
waitqueues have less memory overhead and are faster.
Use simple waitqueues in the RCU code (in 4 different places) and
for handling KVM vCPU wakeups.
(Peter Zijlstra, Daniel Wagner, Thomas Gleixner, Paul Gortmaker,
Marcelo Tosatti)
- sched/numa enhancements (Rik van Riel)
- NOHZ performance enhancements (Rik van Riel)
- Various sched/deadline enhancements (Steven Rostedt)
- Various fixes (Peter Zijlstra)
- ... and a number of other fixes, cleanups and smaller enhancements"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (29 commits)
sched/cputime: Fix steal_account_process_tick() to always return jiffies
sched/deadline: Remove dl_new from struct sched_dl_entity
Revert "kbuild: Add option to turn incompatible pointer check into error"
sched/deadline: Remove superfluous call to switched_to_dl()
sched/debug: Fix preempt_disable_ip recording for preempt_disable()
sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity
time, acct: Drop irq save & restore from __acct_update_integrals()
acct, time: Change indentation in __acct_update_integrals()
sched, time: Remove non-power-of-two divides from __acct_update_integrals()
sched/rt: Kick RT bandwidth timer immediately on start up
sched/debug: Add deadline scheduler bandwidth ratio to /proc/sched_debug
sched/debug: Move sched_domain_sysctl to debug.c
sched/debug: Move the /sys/kernel/debug/sched_features file setup into debug.c
sched/rt: Fix PI handling vs. sched_setscheduler()
sched/core: Remove duplicated sched_group_set_shares() prototype
sched/fair: Consolidate nohz CPU load update code
sched/fair: Avoid using decay_load_missed() with a negative value
sched/deadline: Always calculate end of period on sched_yield()
sched/cgroup: Fix cgroup entity load tracking tear-down
rcu: Use simple wait queues where possible in rcutree
...
37 files changed, 1300 insertions, 715 deletions
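The first item in the pull request, making schedstats a runtime tunable, comes down to wrapping every statistics update behind a static key so the disabled case costs a single patched jump. Below is a minimal sketch of that pattern: the counter and record_wakeup_stat() are illustrative only, while DEFINE_STATIC_KEY_FALSE(sched_schedstats) and set_schedstats() appear in the kernel/sched/core.c hunk at the end of this diff; the exact schedstat_enabled() definition lives in the scheduler's private headers and is assumed here to be a thin static_branch_unlikely() wrapper.

```c
#include <linux/jump_label.h>
#include <linux/types.h>

DEFINE_STATIC_KEY_FALSE(sched_schedstats);

/* Assumed shape of the gate: a single patched jump when disabled. */
#define schedstat_enabled()	static_branch_unlikely(&sched_schedstats)

static u64 wakeup_count;	/* hypothetical counter, for illustration only */

static void record_wakeup_stat(void)
{
	if (schedstat_enabled())	/* no-op unless the key is enabled */
		wakeup_count++;
}

/* Flipped at runtime by the schedstats= boot parameter and the
 * kernel.sched_schedstats sysctl documented in the hunks below. */
static void set_schedstats(bool enabled)
{
	if (enabled)
		static_branch_enable(&sched_schedstats);
	else
		static_branch_disable(&sched_schedstats);
}
```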
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 000336733a6a..8ae47a7b4923 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3532,6 +3532,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
3532 | 3532 | ||
3533 | sched_debug [KNL] Enables verbose scheduler debug messages. | 3533 | sched_debug [KNL] Enables verbose scheduler debug messages. |
3534 | 3534 | ||
3535 | schedstats= [KNL,X86] Enable or disable scheduled statistics. | ||
3536 | Allowed values are enable and disable. This feature | ||
3537 | incurs a small amount of overhead in the scheduler | ||
3538 | but is useful for debugging and performance tuning. | ||
3539 | |||
3535 | skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate | 3540 | skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate |
3536 | xtime_lock contention on larger systems, and/or RCU lock | 3541 | xtime_lock contention on larger systems, and/or RCU lock |
3537 | contention on all systems with CONFIG_MAXSMP set. | 3542 | contention on all systems with CONFIG_MAXSMP set. |
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index f886fbb1ad05..f4444c94ff28 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -773,6 +773,14 @@ rtsig-nr shows the number of RT signals currently queued. | |||
773 | 773 | ||
774 | ============================================================== | 774 | ============================================================== |
775 | 775 | ||
776 | sched_schedstats: | ||
777 | |||
778 | Enables/disables scheduler statistics. Enabling this feature | ||
779 | incurs a small amount of overhead in the scheduler but is | ||
780 | useful for debugging and performance tuning. | ||
781 | |||
782 | ============================================================== | ||
783 | |||
776 | sg-big-buff: | 784 | sg-big-buff: |
777 | 785 | ||
778 | This file shows the size of the generic SCSI (sg) buffer. | 786 | This file shows the size of the generic SCSI (sg) buffer. |
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index dda1959f0dde..08e49c423c24 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -506,18 +506,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm) | |||
506 | struct kvm_vcpu *vcpu; | 506 | struct kvm_vcpu *vcpu; |
507 | 507 | ||
508 | kvm_for_each_vcpu(i, vcpu, kvm) { | 508 | kvm_for_each_vcpu(i, vcpu, kvm) { |
509 | wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); | 509 | struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); |
510 | 510 | ||
511 | vcpu->arch.pause = false; | 511 | vcpu->arch.pause = false; |
512 | wake_up_interruptible(wq); | 512 | swake_up(wq); |
513 | } | 513 | } |
514 | } | 514 | } |
515 | 515 | ||
516 | static void vcpu_sleep(struct kvm_vcpu *vcpu) | 516 | static void vcpu_sleep(struct kvm_vcpu *vcpu) |
517 | { | 517 | { |
518 | wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); | 518 | struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); |
519 | 519 | ||
520 | wait_event_interruptible(*wq, ((!vcpu->arch.power_off) && | 520 | swait_event_interruptible(*wq, ((!vcpu->arch.power_off) && |
521 | (!vcpu->arch.pause))); | 521 | (!vcpu->arch.pause))); |
522 | } | 522 | } |
523 | 523 | ||
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index a9b3b905e661..c2b131527a64 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) | |||
70 | { | 70 | { |
71 | struct kvm *kvm = source_vcpu->kvm; | 71 | struct kvm *kvm = source_vcpu->kvm; |
72 | struct kvm_vcpu *vcpu = NULL; | 72 | struct kvm_vcpu *vcpu = NULL; |
73 | wait_queue_head_t *wq; | 73 | struct swait_queue_head *wq; |
74 | unsigned long cpu_id; | 74 | unsigned long cpu_id; |
75 | unsigned long context_id; | 75 | unsigned long context_id; |
76 | phys_addr_t target_pc; | 76 | phys_addr_t target_pc; |
@@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) | |||
119 | smp_mb(); /* Make sure the above is visible */ | 119 | smp_mb(); /* Make sure the above is visible */ |
120 | 120 | ||
121 | wq = kvm_arch_vcpu_wq(vcpu); | 121 | wq = kvm_arch_vcpu_wq(vcpu); |
122 | wake_up_interruptible(wq); | 122 | swake_up(wq); |
123 | 123 | ||
124 | return PSCI_RET_SUCCESS; | 124 | return PSCI_RET_SUCCESS; |
125 | } | 125 | } |
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 3110447ab1e9..70ef1a43c114 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -445,8 +445,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | |||
445 | 445 | ||
446 | dvcpu->arch.wait = 0; | 446 | dvcpu->arch.wait = 0; |
447 | 447 | ||
448 | if (waitqueue_active(&dvcpu->wq)) | 448 | if (swait_active(&dvcpu->wq)) |
449 | wake_up_interruptible(&dvcpu->wq); | 449 | swake_up(&dvcpu->wq); |
450 | 450 | ||
451 | return 0; | 451 | return 0; |
452 | } | 452 | } |
@@ -1174,8 +1174,8 @@ static void kvm_mips_comparecount_func(unsigned long data) | |||
1174 | kvm_mips_callbacks->queue_timer_int(vcpu); | 1174 | kvm_mips_callbacks->queue_timer_int(vcpu); |
1175 | 1175 | ||
1176 | vcpu->arch.wait = 0; | 1176 | vcpu->arch.wait = 0; |
1177 | if (waitqueue_active(&vcpu->wq)) | 1177 | if (swait_active(&vcpu->wq)) |
1178 | wake_up_interruptible(&vcpu->wq); | 1178 | swake_up(&vcpu->wq); |
1179 | } | 1179 | } |
1180 | 1180 | ||
1181 | /* low level hrtimer wake routine */ | 1181 | /* low level hrtimer wake routine */ |
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 9d08d8cbed1a..c98afa538b3a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -289,7 +289,7 @@ struct kvmppc_vcore { | |||
289 | struct list_head runnable_threads; | 289 | struct list_head runnable_threads; |
290 | struct list_head preempt_list; | 290 | struct list_head preempt_list; |
291 | spinlock_t lock; | 291 | spinlock_t lock; |
292 | wait_queue_head_t wq; | 292 | struct swait_queue_head wq; |
293 | spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ | 293 | spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ |
294 | u64 stolen_tb; | 294 | u64 stolen_tb; |
295 | u64 preempt_tb; | 295 | u64 preempt_tb; |
@@ -629,7 +629,7 @@ struct kvm_vcpu_arch { | |||
629 | u8 prodded; | 629 | u8 prodded; |
630 | u32 last_inst; | 630 | u32 last_inst; |
631 | 631 | ||
632 | wait_queue_head_t *wqp; | 632 | struct swait_queue_head *wqp; |
633 | struct kvmppc_vcore *vcore; | 633 | struct kvmppc_vcore *vcore; |
634 | int ret; | 634 | int ret; |
635 | int trap; | 635 | int trap; |
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index baeddb06811d..f1187bb6dd4d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu) | |||
114 | static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) | 114 | static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) |
115 | { | 115 | { |
116 | int cpu; | 116 | int cpu; |
117 | wait_queue_head_t *wqp; | 117 | struct swait_queue_head *wqp; |
118 | 118 | ||
119 | wqp = kvm_arch_vcpu_wq(vcpu); | 119 | wqp = kvm_arch_vcpu_wq(vcpu); |
120 | if (waitqueue_active(wqp)) { | 120 | if (swait_active(wqp)) { |
121 | wake_up_interruptible(wqp); | 121 | swake_up(wqp); |
122 | ++vcpu->stat.halt_wakeup; | 122 | ++vcpu->stat.halt_wakeup; |
123 | } | 123 | } |
124 | 124 | ||
@@ -701,8 +701,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) | |||
701 | tvcpu->arch.prodded = 1; | 701 | tvcpu->arch.prodded = 1; |
702 | smp_mb(); | 702 | smp_mb(); |
703 | if (vcpu->arch.ceded) { | 703 | if (vcpu->arch.ceded) { |
704 | if (waitqueue_active(&vcpu->wq)) { | 704 | if (swait_active(&vcpu->wq)) { |
705 | wake_up_interruptible(&vcpu->wq); | 705 | swake_up(&vcpu->wq); |
706 | vcpu->stat.halt_wakeup++; | 706 | vcpu->stat.halt_wakeup++; |
707 | } | 707 | } |
708 | } | 708 | } |
@@ -1459,7 +1459,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) | |||
1459 | INIT_LIST_HEAD(&vcore->runnable_threads); | 1459 | INIT_LIST_HEAD(&vcore->runnable_threads); |
1460 | spin_lock_init(&vcore->lock); | 1460 | spin_lock_init(&vcore->lock); |
1461 | spin_lock_init(&vcore->stoltb_lock); | 1461 | spin_lock_init(&vcore->stoltb_lock); |
1462 | init_waitqueue_head(&vcore->wq); | 1462 | init_swait_queue_head(&vcore->wq); |
1463 | vcore->preempt_tb = TB_NIL; | 1463 | vcore->preempt_tb = TB_NIL; |
1464 | vcore->lpcr = kvm->arch.lpcr; | 1464 | vcore->lpcr = kvm->arch.lpcr; |
1465 | vcore->first_vcpuid = core * threads_per_subcore; | 1465 | vcore->first_vcpuid = core * threads_per_subcore; |
@@ -2531,10 +2531,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) | |||
2531 | { | 2531 | { |
2532 | struct kvm_vcpu *vcpu; | 2532 | struct kvm_vcpu *vcpu; |
2533 | int do_sleep = 1; | 2533 | int do_sleep = 1; |
2534 | DECLARE_SWAITQUEUE(wait); | ||
2534 | 2535 | ||
2535 | DEFINE_WAIT(wait); | 2536 | prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); |
2536 | |||
2537 | prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); | ||
2538 | 2537 | ||
2539 | /* | 2538 | /* |
2540 | * Check one last time for pending exceptions and ceded state after | 2539 | * Check one last time for pending exceptions and ceded state after |
@@ -2548,7 +2547,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) | |||
2548 | } | 2547 | } |
2549 | 2548 | ||
2550 | if (!do_sleep) { | 2549 | if (!do_sleep) { |
2551 | finish_wait(&vc->wq, &wait); | 2550 | finish_swait(&vc->wq, &wait); |
2552 | return; | 2551 | return; |
2553 | } | 2552 | } |
2554 | 2553 | ||
@@ -2556,7 +2555,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) | |||
2556 | trace_kvmppc_vcore_blocked(vc, 0); | 2555 | trace_kvmppc_vcore_blocked(vc, 0); |
2557 | spin_unlock(&vc->lock); | 2556 | spin_unlock(&vc->lock); |
2558 | schedule(); | 2557 | schedule(); |
2559 | finish_wait(&vc->wq, &wait); | 2558 | finish_swait(&vc->wq, &wait); |
2560 | spin_lock(&vc->lock); | 2559 | spin_lock(&vc->lock); |
2561 | vc->vcore_state = VCORE_INACTIVE; | 2560 | vc->vcore_state = VCORE_INACTIVE; |
2562 | trace_kvmppc_vcore_blocked(vc, 1); | 2561 | trace_kvmppc_vcore_blocked(vc, 1); |
@@ -2612,7 +2611,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2612 | kvmppc_start_thread(vcpu, vc); | 2611 | kvmppc_start_thread(vcpu, vc); |
2613 | trace_kvm_guest_enter(vcpu); | 2612 | trace_kvm_guest_enter(vcpu); |
2614 | } else if (vc->vcore_state == VCORE_SLEEPING) { | 2613 | } else if (vc->vcore_state == VCORE_SLEEPING) { |
2615 | wake_up(&vc->wq); | 2614 | swake_up(&vc->wq); |
2616 | } | 2615 | } |
2617 | 2616 | ||
2618 | } | 2617 | } |
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8959ebb6d2c9..b0c8ad0799c7 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -467,7 +467,7 @@ struct kvm_s390_irq_payload { | |||
467 | struct kvm_s390_local_interrupt { | 467 | struct kvm_s390_local_interrupt { |
468 | spinlock_t lock; | 468 | spinlock_t lock; |
469 | struct kvm_s390_float_interrupt *float_int; | 469 | struct kvm_s390_float_interrupt *float_int; |
470 | wait_queue_head_t *wq; | 470 | struct swait_queue_head *wq; |
471 | atomic_t *cpuflags; | 471 | atomic_t *cpuflags; |
472 | DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); | 472 | DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); |
473 | struct kvm_s390_irq_payload irq; | 473 | struct kvm_s390_irq_payload irq; |
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f88ca72c3a77..9ffc73221792 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -966,13 +966,13 @@ no_timer: | |||
966 | 966 | ||
967 | void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) | 967 | void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) |
968 | { | 968 | { |
969 | if (waitqueue_active(&vcpu->wq)) { | 969 | if (swait_active(&vcpu->wq)) { |
970 | /* | 970 | /* |
971 | * The vcpu gave up the cpu voluntarily, mark it as a good | 971 | * The vcpu gave up the cpu voluntarily, mark it as a good |
972 | * yield-candidate. | 972 | * yield-candidate. |
973 | */ | 973 | */ |
974 | vcpu->preempted = true; | 974 | vcpu->preempted = true; |
975 | wake_up_interruptible(&vcpu->wq); | 975 | swake_up(&vcpu->wq); |
976 | vcpu->stat.halt_wakeup++; | 976 | vcpu->stat.halt_wakeup++; |
977 | } | 977 | } |
978 | } | 978 | } |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36591faed13b..3a045f39ed81 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) | |||
1195 | static void apic_timer_expired(struct kvm_lapic *apic) | 1195 | static void apic_timer_expired(struct kvm_lapic *apic) |
1196 | { | 1196 | { |
1197 | struct kvm_vcpu *vcpu = apic->vcpu; | 1197 | struct kvm_vcpu *vcpu = apic->vcpu; |
1198 | wait_queue_head_t *q = &vcpu->wq; | 1198 | struct swait_queue_head *q = &vcpu->wq; |
1199 | struct kvm_timer *ktimer = &apic->lapic_timer; | 1199 | struct kvm_timer *ktimer = &apic->lapic_timer; |
1200 | 1200 | ||
1201 | if (atomic_read(&apic->lapic_timer.pending)) | 1201 | if (atomic_read(&apic->lapic_timer.pending)) |
@@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic) | |||
1204 | atomic_inc(&apic->lapic_timer.pending); | 1204 | atomic_inc(&apic->lapic_timer.pending); |
1205 | kvm_set_pending_timer(vcpu); | 1205 | kvm_set_pending_timer(vcpu); |
1206 | 1206 | ||
1207 | if (waitqueue_active(q)) | 1207 | if (swait_active(q)) |
1208 | wake_up_interruptible(q); | 1208 | swake_up(q); |
1209 | 1209 | ||
1210 | if (apic_lvtt_tscdeadline(apic)) | 1210 | if (apic_lvtt_tscdeadline(apic)) |
1211 | ktimer->expired_tscdeadline = ktimer->tscdeadline; | 1211 | ktimer->expired_tscdeadline = ktimer->tscdeadline; |
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index c2b340e23f62..6d9df3f7e334 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -713,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled) | |||
713 | #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) | 713 | #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) |
714 | #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) | 714 | #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) |
715 | 715 | ||
716 | static inline unsigned long get_lock_parent_ip(void) | ||
717 | { | ||
718 | unsigned long addr = CALLER_ADDR0; | ||
719 | |||
720 | if (!in_lock_functions(addr)) | ||
721 | return addr; | ||
722 | addr = CALLER_ADDR1; | ||
723 | if (!in_lock_functions(addr)) | ||
724 | return addr; | ||
725 | return CALLER_ADDR2; | ||
726 | } | ||
727 | |||
716 | #ifdef CONFIG_IRQSOFF_TRACER | 728 | #ifdef CONFIG_IRQSOFF_TRACER |
717 | extern void time_hardirqs_on(unsigned long a0, unsigned long a1); | 729 | extern void time_hardirqs_on(unsigned long a0, unsigned long a1); |
718 | extern void time_hardirqs_off(unsigned long a0, unsigned long a1); | 730 | extern void time_hardirqs_off(unsigned long a0, unsigned long a1); |
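The get_lock_parent_ip() helper added above returns the first return address that is not inside a lock function (walking CALLER_ADDR0..2). Within this series it is used so that the recorded "preemption disabled here" address points at the real caller rather than at a spinlock wrapper; the hook below only illustrates that idea and is not the kernel's code.

```c
#include <linux/ftrace.h>
#include <linux/kernel.h>

static void note_preempt_off_site(void)
{
	/* First caller that is not a lock function. */
	unsigned long ip = get_lock_parent_ip();

	trace_printk("preemption disabled at %pS\n", (void *)ip);
}
```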
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 861f690aa791..5276fe0916fc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/irqflags.h> | 25 | #include <linux/irqflags.h> |
26 | #include <linux/context_tracking.h> | 26 | #include <linux/context_tracking.h> |
27 | #include <linux/irqbypass.h> | 27 | #include <linux/irqbypass.h> |
28 | #include <linux/swait.h> | ||
28 | #include <asm/signal.h> | 29 | #include <asm/signal.h> |
29 | 30 | ||
30 | #include <linux/kvm.h> | 31 | #include <linux/kvm.h> |
@@ -218,7 +219,7 @@ struct kvm_vcpu { | |||
218 | int fpu_active; | 219 | int fpu_active; |
219 | int guest_fpu_loaded, guest_xcr0_loaded; | 220 | int guest_fpu_loaded, guest_xcr0_loaded; |
220 | unsigned char fpu_counter; | 221 | unsigned char fpu_counter; |
221 | wait_queue_head_t wq; | 222 | struct swait_queue_head wq; |
222 | struct pid *pid; | 223 | struct pid *pid; |
223 | int sigset_active; | 224 | int sigset_active; |
224 | sigset_t sigset; | 225 | sigset_t sigset; |
@@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm) | |||
782 | } | 783 | } |
783 | #endif | 784 | #endif |
784 | 785 | ||
785 | static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) | 786 | static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) |
786 | { | 787 | { |
787 | #ifdef __KVM_HAVE_ARCH_WQP | 788 | #ifdef __KVM_HAVE_ARCH_WQP |
788 | return vcpu->arch.wqp; | 789 | return vcpu->arch.wqp; |
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index e23121f9d82a..59ccab297ae0 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) | |||
37 | 37 | ||
38 | void clear_all_latency_tracing(struct task_struct *p); | 38 | void clear_all_latency_tracing(struct task_struct *p); |
39 | 39 | ||
40 | extern int sysctl_latencytop(struct ctl_table *table, int write, | ||
41 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
42 | |||
40 | #else | 43 | #else |
41 | 44 | ||
42 | static inline void | 45 | static inline void |
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a94cc3..838a89a78332 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -182,8 +182,6 @@ extern void update_cpu_load_nohz(int active); | |||
182 | static inline void update_cpu_load_nohz(int active) { } | 182 | static inline void update_cpu_load_nohz(int active) { } |
183 | #endif | 183 | #endif |
184 | 184 | ||
185 | extern unsigned long get_parent_ip(unsigned long addr); | ||
186 | |||
187 | extern void dump_cpu_task(int cpu); | 185 | extern void dump_cpu_task(int cpu); |
188 | 186 | ||
189 | struct seq_file; | 187 | struct seq_file; |
@@ -920,6 +918,10 @@ static inline int sched_info_on(void) | |||
920 | #endif | 918 | #endif |
921 | } | 919 | } |
922 | 920 | ||
921 | #ifdef CONFIG_SCHEDSTATS | ||
922 | void force_schedstat_enabled(void); | ||
923 | #endif | ||
924 | |||
923 | enum cpu_idle_type { | 925 | enum cpu_idle_type { |
924 | CPU_IDLE, | 926 | CPU_IDLE, |
925 | CPU_NOT_IDLE, | 927 | CPU_NOT_IDLE, |
@@ -1289,6 +1291,8 @@ struct sched_rt_entity { | |||
1289 | unsigned long timeout; | 1291 | unsigned long timeout; |
1290 | unsigned long watchdog_stamp; | 1292 | unsigned long watchdog_stamp; |
1291 | unsigned int time_slice; | 1293 | unsigned int time_slice; |
1294 | unsigned short on_rq; | ||
1295 | unsigned short on_list; | ||
1292 | 1296 | ||
1293 | struct sched_rt_entity *back; | 1297 | struct sched_rt_entity *back; |
1294 | #ifdef CONFIG_RT_GROUP_SCHED | 1298 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -1329,10 +1333,6 @@ struct sched_dl_entity { | |||
1329 | * task has to wait for a replenishment to be performed at the | 1333 | * task has to wait for a replenishment to be performed at the |
1330 | * next firing of dl_timer. | 1334 | * next firing of dl_timer. |
1331 | * | 1335 | * |
1332 | * @dl_new tells if a new instance arrived. If so we must | ||
1333 | * start executing it with full runtime and reset its absolute | ||
1334 | * deadline; | ||
1335 | * | ||
1336 | * @dl_boosted tells if we are boosted due to DI. If so we are | 1336 | * @dl_boosted tells if we are boosted due to DI. If so we are |
1337 | * outside bandwidth enforcement mechanism (but only until we | 1337 | * outside bandwidth enforcement mechanism (but only until we |
1338 | * exit the critical section); | 1338 | * exit the critical section); |
@@ -1340,7 +1340,7 @@ struct sched_dl_entity { | |||
1340 | * @dl_yielded tells if task gave up the cpu before consuming | 1340 | * @dl_yielded tells if task gave up the cpu before consuming |
1341 | * all its available runtime during the last job. | 1341 | * all its available runtime during the last job. |
1342 | */ | 1342 | */ |
1343 | int dl_throttled, dl_new, dl_boosted, dl_yielded; | 1343 | int dl_throttled, dl_boosted, dl_yielded; |
1344 | 1344 | ||
1345 | /* | 1345 | /* |
1346 | * Bandwidth enforcement timer. Each -deadline task has its | 1346 | * Bandwidth enforcement timer. Each -deadline task has its |
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731cf10b..4f080ab4f2cd 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write, | |||
95 | void __user *buffer, size_t *lenp, | 95 | void __user *buffer, size_t *lenp, |
96 | loff_t *ppos); | 96 | loff_t *ppos); |
97 | 97 | ||
98 | extern int sysctl_schedstats(struct ctl_table *table, int write, | ||
99 | void __user *buffer, size_t *lenp, | ||
100 | loff_t *ppos); | ||
101 | |||
98 | #endif /* _SCHED_SYSCTL_H */ | 102 | #endif /* _SCHED_SYSCTL_H */ |
diff --git a/include/linux/swait.h b/include/linux/swait.h
new file mode 100644
index 000000000000..c1f9c62a8a50
--- /dev/null
+++ b/include/linux/swait.h
@@ -0,0 +1,172 @@ | |||
1 | #ifndef _LINUX_SWAIT_H | ||
2 | #define _LINUX_SWAIT_H | ||
3 | |||
4 | #include <linux/list.h> | ||
5 | #include <linux/stddef.h> | ||
6 | #include <linux/spinlock.h> | ||
7 | #include <asm/current.h> | ||
8 | |||
9 | /* | ||
10 | * Simple wait queues | ||
11 | * | ||
12 | * While these are very similar to the other/complex wait queues (wait.h) the | ||
13 | * most important difference is that the simple waitqueue allows for | ||
14 | * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold | ||
15 | * times. | ||
16 | * | ||
17 | * In order to make this so, we had to drop a fair number of features of the | ||
18 | * other waitqueue code; notably: | ||
19 | * | ||
20 | * - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue; | ||
21 | * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right | ||
22 | * sleeper state. | ||
23 | * | ||
24 | * - the exclusive mode; because this requires preserving the list order | ||
25 | * and this is hard. | ||
26 | * | ||
27 | * - custom wake functions; because you cannot give any guarantees about | ||
28 | * random code. | ||
29 | * | ||
30 | * As a side effect of this; the data structures are slimmer. | ||
31 | * | ||
32 | * One would recommend using this wait queue where possible. | ||
33 | */ | ||
34 | |||
35 | struct task_struct; | ||
36 | |||
37 | struct swait_queue_head { | ||
38 | raw_spinlock_t lock; | ||
39 | struct list_head task_list; | ||
40 | }; | ||
41 | |||
42 | struct swait_queue { | ||
43 | struct task_struct *task; | ||
44 | struct list_head task_list; | ||
45 | }; | ||
46 | |||
47 | #define __SWAITQUEUE_INITIALIZER(name) { \ | ||
48 | .task = current, \ | ||
49 | .task_list = LIST_HEAD_INIT((name).task_list), \ | ||
50 | } | ||
51 | |||
52 | #define DECLARE_SWAITQUEUE(name) \ | ||
53 | struct swait_queue name = __SWAITQUEUE_INITIALIZER(name) | ||
54 | |||
55 | #define __SWAIT_QUEUE_HEAD_INITIALIZER(name) { \ | ||
56 | .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ | ||
57 | .task_list = LIST_HEAD_INIT((name).task_list), \ | ||
58 | } | ||
59 | |||
60 | #define DECLARE_SWAIT_QUEUE_HEAD(name) \ | ||
61 | struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name) | ||
62 | |||
63 | extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name, | ||
64 | struct lock_class_key *key); | ||
65 | |||
66 | #define init_swait_queue_head(q) \ | ||
67 | do { \ | ||
68 | static struct lock_class_key __key; \ | ||
69 | __init_swait_queue_head((q), #q, &__key); \ | ||
70 | } while (0) | ||
71 | |||
72 | #ifdef CONFIG_LOCKDEP | ||
73 | # define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ | ||
74 | ({ init_swait_queue_head(&name); name; }) | ||
75 | # define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ | ||
76 | struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) | ||
77 | #else | ||
78 | # define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ | ||
79 | DECLARE_SWAIT_QUEUE_HEAD(name) | ||
80 | #endif | ||
81 | |||
82 | static inline int swait_active(struct swait_queue_head *q) | ||
83 | { | ||
84 | return !list_empty(&q->task_list); | ||
85 | } | ||
86 | |||
87 | extern void swake_up(struct swait_queue_head *q); | ||
88 | extern void swake_up_all(struct swait_queue_head *q); | ||
89 | extern void swake_up_locked(struct swait_queue_head *q); | ||
90 | |||
91 | extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); | ||
92 | extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); | ||
93 | extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); | ||
94 | |||
95 | extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); | ||
96 | extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); | ||
97 | |||
98 | /* as per ___wait_event() but for swait, therefore "exclusive == 0" */ | ||
99 | #define ___swait_event(wq, condition, state, ret, cmd) \ | ||
100 | ({ \ | ||
101 | struct swait_queue __wait; \ | ||
102 | long __ret = ret; \ | ||
103 | \ | ||
104 | INIT_LIST_HEAD(&__wait.task_list); \ | ||
105 | for (;;) { \ | ||
106 | long __int = prepare_to_swait_event(&wq, &__wait, state);\ | ||
107 | \ | ||
108 | if (condition) \ | ||
109 | break; \ | ||
110 | \ | ||
111 | if (___wait_is_interruptible(state) && __int) { \ | ||
112 | __ret = __int; \ | ||
113 | break; \ | ||
114 | } \ | ||
115 | \ | ||
116 | cmd; \ | ||
117 | } \ | ||
118 | finish_swait(&wq, &__wait); \ | ||
119 | __ret; \ | ||
120 | }) | ||
121 | |||
122 | #define __swait_event(wq, condition) \ | ||
123 | (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ | ||
124 | schedule()) | ||
125 | |||
126 | #define swait_event(wq, condition) \ | ||
127 | do { \ | ||
128 | if (condition) \ | ||
129 | break; \ | ||
130 | __swait_event(wq, condition); \ | ||
131 | } while (0) | ||
132 | |||
133 | #define __swait_event_timeout(wq, condition, timeout) \ | ||
134 | ___swait_event(wq, ___wait_cond_timeout(condition), \ | ||
135 | TASK_UNINTERRUPTIBLE, timeout, \ | ||
136 | __ret = schedule_timeout(__ret)) | ||
137 | |||
138 | #define swait_event_timeout(wq, condition, timeout) \ | ||
139 | ({ \ | ||
140 | long __ret = timeout; \ | ||
141 | if (!___wait_cond_timeout(condition)) \ | ||
142 | __ret = __swait_event_timeout(wq, condition, timeout); \ | ||
143 | __ret; \ | ||
144 | }) | ||
145 | |||
146 | #define __swait_event_interruptible(wq, condition) \ | ||
147 | ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ | ||
148 | schedule()) | ||
149 | |||
150 | #define swait_event_interruptible(wq, condition) \ | ||
151 | ({ \ | ||
152 | int __ret = 0; \ | ||
153 | if (!(condition)) \ | ||
154 | __ret = __swait_event_interruptible(wq, condition); \ | ||
155 | __ret; \ | ||
156 | }) | ||
157 | |||
158 | #define __swait_event_interruptible_timeout(wq, condition, timeout) \ | ||
159 | ___swait_event(wq, ___wait_cond_timeout(condition), \ | ||
160 | TASK_INTERRUPTIBLE, timeout, \ | ||
161 | __ret = schedule_timeout(__ret)) | ||
162 | |||
163 | #define swait_event_interruptible_timeout(wq, condition, timeout) \ | ||
164 | ({ \ | ||
165 | long __ret = timeout; \ | ||
166 | if (!___wait_cond_timeout(condition)) \ | ||
167 | __ret = __swait_event_interruptible_timeout(wq, \ | ||
168 | condition, timeout); \ | ||
169 | __ret; \ | ||
170 | }) | ||
171 | |||
172 | #endif /* _LINUX_SWAIT_H */ | ||
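Since the header above is the entire public surface of the new simple-waitqueue API, a minimal usage sketch may help; the queue, the flag and both function names here are illustrative, not kernel symbols.

```c
#include <linux/swait.h>
#include <linux/compiler.h>

static DECLARE_SWAIT_QUEUE_HEAD(my_event_wq);
static bool my_event_pending;

/* Waiter: sleeps interruptibly until the condition becomes true. */
static int wait_for_my_event(void)
{
	return swait_event_interruptible(my_event_wq, READ_ONCE(my_event_pending));
}

/* Waker: swake_up() wakes a single waiter and takes the queue lock with
 * IRQs saved; swake_up_all() would wake every waiter on the queue. */
static void signal_my_event(void)
{
	WRITE_ONCE(my_event_pending, true);
	swake_up(&my_event_wq);
}
```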
diff --git a/include/linux/wait.h b/include/linux/wait.h
index ae71a769b89e..27d7a0ab5da3 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -338,7 +338,7 @@ do { \ | |||
338 | schedule(); try_to_freeze()) | 338 | schedule(); try_to_freeze()) |
339 | 339 | ||
340 | /** | 340 | /** |
341 | * wait_event - sleep (or freeze) until a condition gets true | 341 | * wait_event_freezable - sleep (or freeze) until a condition gets true |
342 | * @wq: the waitqueue to wait on | 342 | * @wq: the waitqueue to wait on |
343 | * @condition: a C expression for the event to wait for | 343 | * @condition: a C expression for the event to wait for |
344 | * | 344 | * |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a02812743a7e..b5c30d9f46c5 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -47,12 +47,12 @@ | |||
47 | * of times) | 47 | * of times) |
48 | */ | 48 | */ |
49 | 49 | ||
50 | #include <linux/latencytop.h> | ||
51 | #include <linux/kallsyms.h> | 50 | #include <linux/kallsyms.h> |
52 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
53 | #include <linux/notifier.h> | 52 | #include <linux/notifier.h> |
54 | #include <linux/spinlock.h> | 53 | #include <linux/spinlock.h> |
55 | #include <linux/proc_fs.h> | 54 | #include <linux/proc_fs.h> |
55 | #include <linux/latencytop.h> | ||
56 | #include <linux/export.h> | 56 | #include <linux/export.h> |
57 | #include <linux/sched.h> | 57 | #include <linux/sched.h> |
58 | #include <linux/list.h> | 58 | #include <linux/list.h> |
@@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void) | |||
289 | proc_create("latency_stats", 0644, NULL, &lstats_fops); | 289 | proc_create("latency_stats", 0644, NULL, &lstats_fops); |
290 | return 0; | 290 | return 0; |
291 | } | 291 | } |
292 | |||
293 | int sysctl_latencytop(struct ctl_table *table, int write, | ||
294 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
295 | { | ||
296 | int err; | ||
297 | |||
298 | err = proc_dointvec(table, write, buffer, lenp, ppos); | ||
299 | if (latencytop_enabled) | ||
300 | force_schedstat_enabled(); | ||
301 | |||
302 | return err; | ||
303 | } | ||
292 | device_initcall(init_lstats_procfs); | 304 | device_initcall(init_lstats_procfs); |
diff --git a/kernel/profile.c b/kernel/profile.c
index 99513e1160e5..51369697466e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -59,6 +59,7 @@ int profile_setup(char *str) | |||
59 | 59 | ||
60 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { | 60 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { |
61 | #ifdef CONFIG_SCHEDSTATS | 61 | #ifdef CONFIG_SCHEDSTATS |
62 | force_schedstat_enabled(); | ||
62 | prof_on = SLEEP_PROFILING; | 63 | prof_on = SLEEP_PROFILING; |
63 | if (str[strlen(sleepstr)] == ',') | 64 | if (str[strlen(sleepstr)] == ',') |
64 | str += strlen(sleepstr) + 1; | 65 | str += strlen(sleepstr) + 1; |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e41dd4131f7a..9fd5b628a88d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1614,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | |||
1614 | int needmore; | 1614 | int needmore; |
1615 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1615 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
1616 | 1616 | ||
1617 | rcu_nocb_gp_cleanup(rsp, rnp); | ||
1618 | rnp->need_future_gp[c & 0x1] = 0; | 1617 | rnp->need_future_gp[c & 0x1] = 0; |
1619 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; | 1618 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; |
1620 | trace_rcu_future_gp(rnp, rdp, c, | 1619 | trace_rcu_future_gp(rnp, rdp, c, |
@@ -1635,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) | |||
1635 | !READ_ONCE(rsp->gp_flags) || | 1634 | !READ_ONCE(rsp->gp_flags) || |
1636 | !rsp->gp_kthread) | 1635 | !rsp->gp_kthread) |
1637 | return; | 1636 | return; |
1638 | wake_up(&rsp->gp_wq); | 1637 | swake_up(&rsp->gp_wq); |
1639 | } | 1638 | } |
1640 | 1639 | ||
1641 | /* | 1640 | /* |
@@ -2010,6 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
2010 | int nocb = 0; | 2009 | int nocb = 0; |
2011 | struct rcu_data *rdp; | 2010 | struct rcu_data *rdp; |
2012 | struct rcu_node *rnp = rcu_get_root(rsp); | 2011 | struct rcu_node *rnp = rcu_get_root(rsp); |
2012 | struct swait_queue_head *sq; | ||
2013 | 2013 | ||
2014 | WRITE_ONCE(rsp->gp_activity, jiffies); | 2014 | WRITE_ONCE(rsp->gp_activity, jiffies); |
2015 | raw_spin_lock_irq_rcu_node(rnp); | 2015 | raw_spin_lock_irq_rcu_node(rnp); |
@@ -2046,7 +2046,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
2046 | needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; | 2046 | needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; |
2047 | /* smp_mb() provided by prior unlock-lock pair. */ | 2047 | /* smp_mb() provided by prior unlock-lock pair. */ |
2048 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 2048 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
2049 | sq = rcu_nocb_gp_get(rnp); | ||
2049 | raw_spin_unlock_irq(&rnp->lock); | 2050 | raw_spin_unlock_irq(&rnp->lock); |
2051 | rcu_nocb_gp_cleanup(sq); | ||
2050 | cond_resched_rcu_qs(); | 2052 | cond_resched_rcu_qs(); |
2051 | WRITE_ONCE(rsp->gp_activity, jiffies); | 2053 | WRITE_ONCE(rsp->gp_activity, jiffies); |
2052 | rcu_gp_slow(rsp, gp_cleanup_delay); | 2054 | rcu_gp_slow(rsp, gp_cleanup_delay); |
@@ -2092,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
2092 | READ_ONCE(rsp->gpnum), | 2094 | READ_ONCE(rsp->gpnum), |
2093 | TPS("reqwait")); | 2095 | TPS("reqwait")); |
2094 | rsp->gp_state = RCU_GP_WAIT_GPS; | 2096 | rsp->gp_state = RCU_GP_WAIT_GPS; |
2095 | wait_event_interruptible(rsp->gp_wq, | 2097 | swait_event_interruptible(rsp->gp_wq, |
2096 | READ_ONCE(rsp->gp_flags) & | 2098 | READ_ONCE(rsp->gp_flags) & |
2097 | RCU_GP_FLAG_INIT); | 2099 | RCU_GP_FLAG_INIT); |
2098 | rsp->gp_state = RCU_GP_DONE_GPS; | 2100 | rsp->gp_state = RCU_GP_DONE_GPS; |
@@ -2122,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
2122 | READ_ONCE(rsp->gpnum), | 2124 | READ_ONCE(rsp->gpnum), |
2123 | TPS("fqswait")); | 2125 | TPS("fqswait")); |
2124 | rsp->gp_state = RCU_GP_WAIT_FQS; | 2126 | rsp->gp_state = RCU_GP_WAIT_FQS; |
2125 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | 2127 | ret = swait_event_interruptible_timeout(rsp->gp_wq, |
2126 | rcu_gp_fqs_check_wake(rsp, &gf), j); | 2128 | rcu_gp_fqs_check_wake(rsp, &gf), j); |
2127 | rsp->gp_state = RCU_GP_DOING_FQS; | 2129 | rsp->gp_state = RCU_GP_DOING_FQS; |
2128 | /* Locking provides needed memory barriers. */ | 2130 | /* Locking provides needed memory barriers. */ |
@@ -2246,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
2246 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 2248 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
2247 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2249 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
2248 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | 2250 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); |
2249 | rcu_gp_kthread_wake(rsp); | 2251 | swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ |
2250 | } | 2252 | } |
2251 | 2253 | ||
2252 | /* | 2254 | /* |
@@ -2900,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
2900 | } | 2902 | } |
2901 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2903 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
2902 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2904 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
2903 | rcu_gp_kthread_wake(rsp); | 2905 | swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ |
2904 | } | 2906 | } |
2905 | 2907 | ||
2906 | /* | 2908 | /* |
@@ -3529,7 +3531,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
3529 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3531 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
3530 | if (wake) { | 3532 | if (wake) { |
3531 | smp_mb(); /* EGP done before wake_up(). */ | 3533 | smp_mb(); /* EGP done before wake_up(). */ |
3532 | wake_up(&rsp->expedited_wq); | 3534 | swake_up(&rsp->expedited_wq); |
3533 | } | 3535 | } |
3534 | break; | 3536 | break; |
3535 | } | 3537 | } |
@@ -3780,7 +3782,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
3780 | jiffies_start = jiffies; | 3782 | jiffies_start = jiffies; |
3781 | 3783 | ||
3782 | for (;;) { | 3784 | for (;;) { |
3783 | ret = wait_event_interruptible_timeout( | 3785 | ret = swait_event_timeout( |
3784 | rsp->expedited_wq, | 3786 | rsp->expedited_wq, |
3785 | sync_rcu_preempt_exp_done(rnp_root), | 3787 | sync_rcu_preempt_exp_done(rnp_root), |
3786 | jiffies_stall); | 3788 | jiffies_stall); |
@@ -3788,7 +3790,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
3788 | return; | 3790 | return; |
3789 | if (ret < 0) { | 3791 | if (ret < 0) { |
3790 | /* Hit a signal, disable CPU stall warnings. */ | 3792 | /* Hit a signal, disable CPU stall warnings. */ |
3791 | wait_event(rsp->expedited_wq, | 3793 | swait_event(rsp->expedited_wq, |
3792 | sync_rcu_preempt_exp_done(rnp_root)); | 3794 | sync_rcu_preempt_exp_done(rnp_root)); |
3793 | return; | 3795 | return; |
3794 | } | 3796 | } |
@@ -4482,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
4482 | } | 4484 | } |
4483 | } | 4485 | } |
4484 | 4486 | ||
4485 | init_waitqueue_head(&rsp->gp_wq); | 4487 | init_swait_queue_head(&rsp->gp_wq); |
4486 | init_waitqueue_head(&rsp->expedited_wq); | 4488 | init_swait_queue_head(&rsp->expedited_wq); |
4487 | rnp = rsp->level[rcu_num_lvls - 1]; | 4489 | rnp = rsp->level[rcu_num_lvls - 1]; |
4488 | for_each_possible_cpu(i) { | 4490 | for_each_possible_cpu(i) { |
4489 | while (i > rnp->grphi) | 4491 | while (i > rnp->grphi) |
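One detail worth calling out in the kernel/rcu/tree.c changes above: the nocb grace-period wakeup is moved out from under rnp->lock. rcu_gp_cleanup() now fetches the wait-queue pointer with rcu_nocb_gp_get() while holding the lock and calls rcu_nocb_gp_cleanup() (which does swake_up_all()) only after unlocking, keeping raw-spinlock hold times bounded. A simplified sketch of that shape, using the functions introduced in this diff:

```c
/* Simplified from rcu_gp_cleanup(); the rest of the per-node
 * grace-period bookkeeping is omitted. */
static void nocb_cleanup_and_wake(struct rcu_node *rnp)
{
	struct swait_queue_head *sq;

	raw_spin_lock_irq(&rnp->lock);
	/* ... cleanup that genuinely needs the node lock ... */
	sq = rcu_nocb_gp_get(rnp);	/* only returns a pointer */
	raw_spin_unlock_irq(&rnp->lock);

	rcu_nocb_gp_cleanup(sq);	/* swake_up_all() without rnp->lock held */
}
```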
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 83360b4f4352..bbd235d0e71f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/threads.h> | 27 | #include <linux/threads.h> |
28 | #include <linux/cpumask.h> | 28 | #include <linux/cpumask.h> |
29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
30 | #include <linux/swait.h> | ||
30 | #include <linux/stop_machine.h> | 31 | #include <linux/stop_machine.h> |
31 | 32 | ||
32 | /* | 33 | /* |
@@ -243,7 +244,7 @@ struct rcu_node { | |||
243 | /* Refused to boost: not sure why, though. */ | 244 | /* Refused to boost: not sure why, though. */ |
244 | /* This can happen due to race conditions. */ | 245 | /* This can happen due to race conditions. */ |
245 | #ifdef CONFIG_RCU_NOCB_CPU | 246 | #ifdef CONFIG_RCU_NOCB_CPU |
246 | wait_queue_head_t nocb_gp_wq[2]; | 247 | struct swait_queue_head nocb_gp_wq[2]; |
247 | /* Place for rcu_nocb_kthread() to wait GP. */ | 248 | /* Place for rcu_nocb_kthread() to wait GP. */ |
248 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 249 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
249 | int need_future_gp[2]; | 250 | int need_future_gp[2]; |
@@ -399,7 +400,7 @@ struct rcu_data { | |||
399 | atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ | 400 | atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ |
400 | struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ | 401 | struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ |
401 | struct rcu_head **nocb_follower_tail; | 402 | struct rcu_head **nocb_follower_tail; |
402 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | 403 | struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ |
403 | struct task_struct *nocb_kthread; | 404 | struct task_struct *nocb_kthread; |
404 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ | 405 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ |
405 | 406 | ||
@@ -478,7 +479,7 @@ struct rcu_state { | |||
478 | unsigned long gpnum; /* Current gp number. */ | 479 | unsigned long gpnum; /* Current gp number. */ |
479 | unsigned long completed; /* # of last completed gp. */ | 480 | unsigned long completed; /* # of last completed gp. */ |
480 | struct task_struct *gp_kthread; /* Task for grace periods. */ | 481 | struct task_struct *gp_kthread; /* Task for grace periods. */ |
481 | wait_queue_head_t gp_wq; /* Where GP task waits. */ | 482 | struct swait_queue_head gp_wq; /* Where GP task waits. */ |
482 | short gp_flags; /* Commands for GP task. */ | 483 | short gp_flags; /* Commands for GP task. */ |
483 | short gp_state; /* GP kthread sleep state. */ | 484 | short gp_state; /* GP kthread sleep state. */ |
484 | 485 | ||
@@ -506,7 +507,7 @@ struct rcu_state { | |||
506 | unsigned long expedited_sequence; /* Take a ticket. */ | 507 | unsigned long expedited_sequence; /* Take a ticket. */ |
507 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ | 508 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ |
508 | atomic_t expedited_need_qs; /* # CPUs left to check in. */ | 509 | atomic_t expedited_need_qs; /* # CPUs left to check in. */ |
509 | wait_queue_head_t expedited_wq; /* Wait for check-ins. */ | 510 | struct swait_queue_head expedited_wq; /* Wait for check-ins. */ |
510 | int ncpus_snap; /* # CPUs seen last time. */ | 511 | int ncpus_snap; /* # CPUs seen last time. */ |
511 | 512 | ||
512 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 513 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
@@ -621,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); | |||
621 | static void increment_cpu_stall_ticks(void); | 622 | static void increment_cpu_stall_ticks(void); |
622 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); | 623 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); |
623 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | 624 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); |
624 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | 625 | static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); |
626 | static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); | ||
625 | static void rcu_init_one_nocb(struct rcu_node *rnp); | 627 | static void rcu_init_one_nocb(struct rcu_node *rnp); |
626 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 628 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
627 | bool lazy, unsigned long flags); | 629 | bool lazy, unsigned long flags); |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 9467a8b7e756..080bd202d360 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | |||
1811 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended | 1811 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended |
1812 | * grace period. | 1812 | * grace period. |
1813 | */ | 1813 | */ |
1814 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 1814 | static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) |
1815 | { | 1815 | { |
1816 | wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); | 1816 | swake_up_all(sq); |
1817 | } | 1817 | } |
1818 | 1818 | ||
1819 | /* | 1819 | /* |
@@ -1829,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) | |||
1829 | rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; | 1829 | rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; |
1830 | } | 1830 | } |
1831 | 1831 | ||
1832 | static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) | ||
1833 | { | ||
1834 | return &rnp->nocb_gp_wq[rnp->completed & 0x1]; | ||
1835 | } | ||
1836 | |||
1832 | static void rcu_init_one_nocb(struct rcu_node *rnp) | 1837 | static void rcu_init_one_nocb(struct rcu_node *rnp) |
1833 | { | 1838 | { |
1834 | init_waitqueue_head(&rnp->nocb_gp_wq[0]); | 1839 | init_swait_queue_head(&rnp->nocb_gp_wq[0]); |
1835 | init_waitqueue_head(&rnp->nocb_gp_wq[1]); | 1840 | init_swait_queue_head(&rnp->nocb_gp_wq[1]); |
1836 | } | 1841 | } |
1837 | 1842 | ||
1838 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1843 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
@@ -1857,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) | |||
1857 | if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { | 1862 | if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { |
1858 | /* Prior smp_mb__after_atomic() orders against prior enqueue. */ | 1863 | /* Prior smp_mb__after_atomic() orders against prior enqueue. */ |
1859 | WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); | 1864 | WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); |
1860 | wake_up(&rdp_leader->nocb_wq); | 1865 | swake_up(&rdp_leader->nocb_wq); |
1861 | } | 1866 | } |
1862 | } | 1867 | } |
1863 | 1868 | ||
@@ -2069,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2069 | */ | 2074 | */ |
2070 | trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); | 2075 | trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); |
2071 | for (;;) { | 2076 | for (;;) { |
2072 | wait_event_interruptible( | 2077 | swait_event_interruptible( |
2073 | rnp->nocb_gp_wq[c & 0x1], | 2078 | rnp->nocb_gp_wq[c & 0x1], |
2074 | (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); | 2079 | (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); |
2075 | if (likely(d)) | 2080 | if (likely(d)) |
@@ -2097,7 +2102,7 @@ wait_again: | |||
2097 | /* Wait for callbacks to appear. */ | 2102 | /* Wait for callbacks to appear. */ |
2098 | if (!rcu_nocb_poll) { | 2103 | if (!rcu_nocb_poll) { |
2099 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); | 2104 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); |
2100 | wait_event_interruptible(my_rdp->nocb_wq, | 2105 | swait_event_interruptible(my_rdp->nocb_wq, |
2101 | !READ_ONCE(my_rdp->nocb_leader_sleep)); | 2106 | !READ_ONCE(my_rdp->nocb_leader_sleep)); |
2102 | /* Memory barrier handled by smp_mb() calls below and repoll. */ | 2107 | /* Memory barrier handled by smp_mb() calls below and repoll. */ |
2103 | } else if (firsttime) { | 2108 | } else if (firsttime) { |
@@ -2172,7 +2177,7 @@ wait_again: | |||
2172 | * List was empty, wake up the follower. | 2177 | * List was empty, wake up the follower. |
2173 | * Memory barriers supplied by atomic_long_add(). | 2178 | * Memory barriers supplied by atomic_long_add(). |
2174 | */ | 2179 | */ |
2175 | wake_up(&rdp->nocb_wq); | 2180 | swake_up(&rdp->nocb_wq); |
2176 | } | 2181 | } |
2177 | } | 2182 | } |
2178 | 2183 | ||
@@ -2193,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) | |||
2193 | if (!rcu_nocb_poll) { | 2198 | if (!rcu_nocb_poll) { |
2194 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2199 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
2195 | "FollowerSleep"); | 2200 | "FollowerSleep"); |
2196 | wait_event_interruptible(rdp->nocb_wq, | 2201 | swait_event_interruptible(rdp->nocb_wq, |
2197 | READ_ONCE(rdp->nocb_follower_head)); | 2202 | READ_ONCE(rdp->nocb_follower_head)); |
2198 | } else if (firsttime) { | 2203 | } else if (firsttime) { |
2199 | /* Don't drown trace log with "Poll"! */ | 2204 | /* Don't drown trace log with "Poll"! */ |
@@ -2352,7 +2357,7 @@ void __init rcu_init_nohz(void) | |||
2352 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | 2357 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) |
2353 | { | 2358 | { |
2354 | rdp->nocb_tail = &rdp->nocb_head; | 2359 | rdp->nocb_tail = &rdp->nocb_head; |
2355 | init_waitqueue_head(&rdp->nocb_wq); | 2360 | init_swait_queue_head(&rdp->nocb_wq); |
2356 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; | 2361 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; |
2357 | } | 2362 | } |
2358 | 2363 | ||
@@ -2502,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | |||
2502 | return false; | 2507 | return false; |
2503 | } | 2508 | } |
2504 | 2509 | ||
2505 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 2510 | static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) |
2506 | { | 2511 | { |
2507 | } | 2512 | } |
2508 | 2513 | ||
@@ -2510,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) | |||
2510 | { | 2515 | { |
2511 | } | 2516 | } |
2512 | 2517 | ||
2518 | static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) | ||
2519 | { | ||
2520 | return NULL; | ||
2521 | } | ||
2522 | |||
2513 | static void rcu_init_one_nocb(struct rcu_node *rnp) | 2523 | static void rcu_init_one_nocb(struct rcu_node *rnp) |
2514 | { | 2524 | { |
2515 | } | 2525 | } |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973ce80..7d4cba227cbd 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif | |||
13 | 13 | ||
14 | obj-y += core.o loadavg.o clock.o cputime.o | 14 | obj-y += core.o loadavg.o clock.o cputime.o |
15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | 15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o |
16 | obj-y += wait.o completion.o idle.o | 16 | obj-y += wait.o swait.o completion.o idle.o |
17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o | 17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o |
18 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 18 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
19 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 19 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41f6b2215aa8..05114b15b6d1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -67,12 +67,10 @@ | |||
67 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
68 | #include <linux/hrtimer.h> | 68 | #include <linux/hrtimer.h> |
69 | #include <linux/tick.h> | 69 | #include <linux/tick.h> |
70 | #include <linux/debugfs.h> | ||
71 | #include <linux/ctype.h> | 70 | #include <linux/ctype.h> |
72 | #include <linux/ftrace.h> | 71 | #include <linux/ftrace.h> |
73 | #include <linux/slab.h> | 72 | #include <linux/slab.h> |
74 | #include <linux/init_task.h> | 73 | #include <linux/init_task.h> |
75 | #include <linux/binfmts.h> | ||
76 | #include <linux/context_tracking.h> | 74 | #include <linux/context_tracking.h> |
77 | #include <linux/compiler.h> | 75 | #include <linux/compiler.h> |
78 | 76 | ||
@@ -125,138 +123,6 @@ const_debug unsigned int sysctl_sched_features = | |||
125 | 123 | ||
126 | #undef SCHED_FEAT | 124 | #undef SCHED_FEAT |
127 | 125 | ||
128 | #ifdef CONFIG_SCHED_DEBUG | ||
129 | #define SCHED_FEAT(name, enabled) \ | ||
130 | #name , | ||
131 | |||
132 | static const char * const sched_feat_names[] = { | ||
133 | #include "features.h" | ||
134 | }; | ||
135 | |||
136 | #undef SCHED_FEAT | ||
137 | |||
138 | static int sched_feat_show(struct seq_file *m, void *v) | ||
139 | { | ||
140 | int i; | ||
141 | |||
142 | for (i = 0; i < __SCHED_FEAT_NR; i++) { | ||
143 | if (!(sysctl_sched_features & (1UL << i))) | ||
144 | seq_puts(m, "NO_"); | ||
145 | seq_printf(m, "%s ", sched_feat_names[i]); | ||
146 | } | ||
147 | seq_puts(m, "\n"); | ||
148 | |||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | #ifdef HAVE_JUMP_LABEL | ||
153 | |||
154 | #define jump_label_key__true STATIC_KEY_INIT_TRUE | ||
155 | #define jump_label_key__false STATIC_KEY_INIT_FALSE | ||
156 | |||
157 | #define SCHED_FEAT(name, enabled) \ | ||
158 | jump_label_key__##enabled , | ||
159 | |||
160 | struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { | ||
161 | #include "features.h" | ||
162 | }; | ||
163 | |||
164 | #undef SCHED_FEAT | ||
165 | |||
166 | static void sched_feat_disable(int i) | ||
167 | { | ||
168 | static_key_disable(&sched_feat_keys[i]); | ||
169 | } | ||
170 | |||
171 | static void sched_feat_enable(int i) | ||
172 | { | ||
173 | static_key_enable(&sched_feat_keys[i]); | ||
174 | } | ||
175 | #else | ||
176 | static void sched_feat_disable(int i) { }; | ||
177 | static void sched_feat_enable(int i) { }; | ||
178 | #endif /* HAVE_JUMP_LABEL */ | ||
179 | |||
180 | static int sched_feat_set(char *cmp) | ||
181 | { | ||
182 | int i; | ||
183 | int neg = 0; | ||
184 | |||
185 | if (strncmp(cmp, "NO_", 3) == 0) { | ||
186 | neg = 1; | ||
187 | cmp += 3; | ||
188 | } | ||
189 | |||
190 | for (i = 0; i < __SCHED_FEAT_NR; i++) { | ||
191 | if (strcmp(cmp, sched_feat_names[i]) == 0) { | ||
192 | if (neg) { | ||
193 | sysctl_sched_features &= ~(1UL << i); | ||
194 | sched_feat_disable(i); | ||
195 | } else { | ||
196 | sysctl_sched_features |= (1UL << i); | ||
197 | sched_feat_enable(i); | ||
198 | } | ||
199 | break; | ||
200 | } | ||
201 | } | ||
202 | |||
203 | return i; | ||
204 | } | ||
205 | |||
206 | static ssize_t | ||
207 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
208 | size_t cnt, loff_t *ppos) | ||
209 | { | ||
210 | char buf[64]; | ||
211 | char *cmp; | ||
212 | int i; | ||
213 | struct inode *inode; | ||
214 | |||
215 | if (cnt > 63) | ||
216 | cnt = 63; | ||
217 | |||
218 | if (copy_from_user(&buf, ubuf, cnt)) | ||
219 | return -EFAULT; | ||
220 | |||
221 | buf[cnt] = 0; | ||
222 | cmp = strstrip(buf); | ||
223 | |||
224 | /* Ensure the static_key remains in a consistent state */ | ||
225 | inode = file_inode(filp); | ||
226 | inode_lock(inode); | ||
227 | i = sched_feat_set(cmp); | ||
228 | inode_unlock(inode); | ||
229 | if (i == __SCHED_FEAT_NR) | ||
230 | return -EINVAL; | ||
231 | |||
232 | *ppos += cnt; | ||
233 | |||
234 | return cnt; | ||
235 | } | ||
236 | |||
237 | static int sched_feat_open(struct inode *inode, struct file *filp) | ||
238 | { | ||
239 | return single_open(filp, sched_feat_show, NULL); | ||
240 | } | ||
241 | |||
242 | static const struct file_operations sched_feat_fops = { | ||
243 | .open = sched_feat_open, | ||
244 | .write = sched_feat_write, | ||
245 | .read = seq_read, | ||
246 | .llseek = seq_lseek, | ||
247 | .release = single_release, | ||
248 | }; | ||
249 | |||
250 | static __init int sched_init_debug(void) | ||
251 | { | ||
252 | debugfs_create_file("sched_features", 0644, NULL, NULL, | ||
253 | &sched_feat_fops); | ||
254 | |||
255 | return 0; | ||
256 | } | ||
257 | late_initcall(sched_init_debug); | ||
258 | #endif /* CONFIG_SCHED_DEBUG */ | ||
259 | |||
260 | /* | 126 | /* |
261 | * Number of tasks to iterate in a single balance run. | 127 | * Number of tasks to iterate in a single balance run. |
262 | * Limited because this is done with IRQs disabled. | 128 | * Limited because this is done with IRQs disabled. |
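Editor's note: the sched_features plumbing removed from core.c above reappears, essentially unchanged, in kernel/sched/debug.c later in this diff. It is built around an x-macro: features.h is included several times with different SCHED_FEAT() definitions to generate the bit enum, the name table and (with jump labels) the static keys, so one string parser can toggle any feature by name. A minimal, self-contained userspace sketch of that trick follows; the feature list and the all-enabled default mask are made up for illustration.

/*
 * Hedged userspace sketch of the SCHED_FEAT x-macro pattern: one feature
 * list expands into both an enum of bit positions and a table of names,
 * so a single parser can flip bits by (optionally NO_-prefixed) name.
 */
#include <stdio.h>
#include <string.h>

#define FEATURES(F)			\
	F(GENTLE_FAIR_SLEEPERS)		\
	F(START_DEBIT)			\
	F(WAKEUP_PREEMPTION)

#define F_ENUM(name) __FEAT_##name,
enum { FEATURES(F_ENUM) __FEAT_NR };
#undef F_ENUM

#define F_NAME(name) #name,
static const char * const feat_names[] = { FEATURES(F_NAME) };
#undef F_NAME

static unsigned long features = (1UL << __FEAT_NR) - 1;	/* all enabled */

/* Returns the matched index, or __FEAT_NR when nothing matched. */
static int feat_set(const char *cmp)
{
	int neg = 0, i;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}
	for (i = 0; i < __FEAT_NR; i++) {
		if (strcmp(cmp, feat_names[i]) == 0) {
			if (neg)
				features &= ~(1UL << i);
			else
				features |= 1UL << i;
			break;
		}
	}
	return i;
}

int main(void)
{
	int i;

	feat_set("NO_START_DEBIT");
	for (i = 0; i < __FEAT_NR; i++)
		printf("%s%s ", (features & (1UL << i)) ? "" : "NO_",
		       feat_names[i]);
	printf("\n");
	return 0;
}

Writing the bare name re-enables the bit, much as echoing FOO or NO_FOO into /sys/kernel/debug/sched_features does.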
@@ -2094,7 +1960,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2094 | 1960 | ||
2095 | ttwu_queue(p, cpu); | 1961 | ttwu_queue(p, cpu); |
2096 | stat: | 1962 | stat: |
2097 | ttwu_stat(p, cpu, wake_flags); | 1963 | if (schedstat_enabled()) |
1964 | ttwu_stat(p, cpu, wake_flags); | ||
2098 | out: | 1965 | out: |
2099 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 1966 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2100 | 1967 | ||
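Editor's note: from here on, ttwu_stat() and the other schedstat updates are only reached when schedstat_enabled() says so, so a CONFIG_SCHEDSTATS=y kernel with the runtime switch off pays only for a patched-out branch. A rough userspace analogue, using an ordinary flag plus a branch hint instead of a real static key, just to show the call-site shape:

/*
 * Rough analogue of the schedstat_enabled() guard: the kernel uses a
 * static key whose branch is patched at runtime; here a plain flag and a
 * branch-prediction hint stand in.
 */
#include <stdbool.h>
#include <stdio.h>

#define unlikely(x)	__builtin_expect(!!(x), 0)

static bool sched_schedstats;	/* runtime tunable, default off */

static void ttwu_stat(int cpu, int wake_flags)
{
	printf("wakeup stats: cpu=%d wake_flags=%d\n", cpu, wake_flags);
}

static void wakeup_path(int cpu, int wake_flags)
{
	/* when stats are off, the bookkeeping is skipped entirely */
	if (unlikely(sched_schedstats))
		ttwu_stat(cpu, wake_flags);
}

int main(void)
{
	wakeup_path(0, 0);		/* prints nothing: stats disabled */
	sched_schedstats = true;	/* e.g. schedstats=enable at boot */
	wakeup_path(1, 2);		/* stats collected again */
	return 0;
}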
@@ -2142,7 +2009,8 @@ static void try_to_wake_up_local(struct task_struct *p) | |||
2142 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2009 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2143 | 2010 | ||
2144 | ttwu_do_wakeup(rq, p, 0); | 2011 | ttwu_do_wakeup(rq, p, 0); |
2145 | ttwu_stat(p, smp_processor_id(), 0); | 2012 | if (schedstat_enabled()) |
2013 | ttwu_stat(p, smp_processor_id(), 0); | ||
2146 | out: | 2014 | out: |
2147 | raw_spin_unlock(&p->pi_lock); | 2015 | raw_spin_unlock(&p->pi_lock); |
2148 | } | 2016 | } |
@@ -2184,7 +2052,6 @@ void __dl_clear_params(struct task_struct *p) | |||
2184 | dl_se->dl_bw = 0; | 2052 | dl_se->dl_bw = 0; |
2185 | 2053 | ||
2186 | dl_se->dl_throttled = 0; | 2054 | dl_se->dl_throttled = 0; |
2187 | dl_se->dl_new = 1; | ||
2188 | dl_se->dl_yielded = 0; | 2055 | dl_se->dl_yielded = 0; |
2189 | } | 2056 | } |
2190 | 2057 | ||
@@ -2211,6 +2078,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2211 | #endif | 2078 | #endif |
2212 | 2079 | ||
2213 | #ifdef CONFIG_SCHEDSTATS | 2080 | #ifdef CONFIG_SCHEDSTATS |
2081 | /* Even if schedstat is disabled, there should not be garbage */ | ||
2214 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2082 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
2215 | #endif | 2083 | #endif |
2216 | 2084 | ||
@@ -2219,6 +2087,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2219 | __dl_clear_params(p); | 2087 | __dl_clear_params(p); |
2220 | 2088 | ||
2221 | INIT_LIST_HEAD(&p->rt.run_list); | 2089 | INIT_LIST_HEAD(&p->rt.run_list); |
2090 | p->rt.timeout = 0; | ||
2091 | p->rt.time_slice = sched_rr_timeslice; | ||
2092 | p->rt.on_rq = 0; | ||
2093 | p->rt.on_list = 0; | ||
2222 | 2094 | ||
2223 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2095 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2224 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2096 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
@@ -2282,6 +2154,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, | |||
2282 | #endif | 2154 | #endif |
2283 | #endif | 2155 | #endif |
2284 | 2156 | ||
2157 | DEFINE_STATIC_KEY_FALSE(sched_schedstats); | ||
2158 | |||
2159 | #ifdef CONFIG_SCHEDSTATS | ||
2160 | static void set_schedstats(bool enabled) | ||
2161 | { | ||
2162 | if (enabled) | ||
2163 | static_branch_enable(&sched_schedstats); | ||
2164 | else | ||
2165 | static_branch_disable(&sched_schedstats); | ||
2166 | } | ||
2167 | |||
2168 | void force_schedstat_enabled(void) | ||
2169 | { | ||
2170 | if (!schedstat_enabled()) { | ||
2171 | pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); | ||
2172 | static_branch_enable(&sched_schedstats); | ||
2173 | } | ||
2174 | } | ||
2175 | |||
2176 | static int __init setup_schedstats(char *str) | ||
2177 | { | ||
2178 | int ret = 0; | ||
2179 | if (!str) | ||
2180 | goto out; | ||
2181 | |||
2182 | if (!strcmp(str, "enable")) { | ||
2183 | set_schedstats(true); | ||
2184 | ret = 1; | ||
2185 | } else if (!strcmp(str, "disable")) { | ||
2186 | set_schedstats(false); | ||
2187 | ret = 1; | ||
2188 | } | ||
2189 | out: | ||
2190 | if (!ret) | ||
2191 | pr_warn("Unable to parse schedstats=\n"); | ||
2192 | |||
2193 | return ret; | ||
2194 | } | ||
2195 | __setup("schedstats=", setup_schedstats); | ||
2196 | |||
2197 | #ifdef CONFIG_PROC_SYSCTL | ||
2198 | int sysctl_schedstats(struct ctl_table *table, int write, | ||
2199 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2200 | { | ||
2201 | struct ctl_table t; | ||
2202 | int err; | ||
2203 | int state = static_branch_likely(&sched_schedstats); | ||
2204 | |||
2205 | if (write && !capable(CAP_SYS_ADMIN)) | ||
2206 | return -EPERM; | ||
2207 | |||
2208 | t = *table; | ||
2209 | t.data = &state; | ||
2210 | err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); | ||
2211 | if (err < 0) | ||
2212 | return err; | ||
2213 | if (write) | ||
2214 | set_schedstats(state); | ||
2215 | return err; | ||
2216 | } | ||
2217 | #endif | ||
2218 | #endif | ||
2219 | |||
2285 | /* | 2220 | /* |
2286 | * fork()/clone()-time setup: | 2221 | * fork()/clone()-time setup: |
2287 | */ | 2222 | */ |
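Editor's note: both the schedstats= boot parameter and the kernel.sched_schedstats sysctl above funnel into set_schedstats(), which flips the static branch. The sysctl handler copies the ctl_table, points ->data at a local int, lets the generic integer parser do the validation, and only then applies the result. A hedged userspace sketch of that shape; parse_bool_int() is a stand-in for proc_dointvec_minmax(), not a kernel API.

/*
 * Sysctl-handler sketch: snapshot the state into a local, parse into the
 * local, and apply only after a successful write.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static bool schedstats_enabled;

static void set_schedstats(bool enabled)
{
	/* kernel: static_branch_enable()/static_branch_disable() */
	schedstats_enabled = enabled;
}

static int parse_bool_int(const char *buf, int *val)
{
	char *end;
	long v = strtol(buf, &end, 10);

	if (end == buf || (v != 0 && v != 1))
		return -1;
	*val = (int)v;
	return 0;
}

static int sysctl_schedstats(const char *buf, bool write)
{
	int state = schedstats_enabled;	/* snapshot current state */

	if (!write) {
		printf("%d\n", state);
		return 0;
	}
	if (parse_bool_int(buf, &state) < 0)
		return -1;		/* reject junk, keep old state */
	set_schedstats(state);
	return 0;
}

int main(void)
{
	sysctl_schedstats(NULL, false);	/* prints 0 */
	sysctl_schedstats("1", true);
	sysctl_schedstats(NULL, false);	/* prints 1 */
	return 0;
}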
@@ -3011,16 +2946,6 @@ u64 scheduler_tick_max_deferment(void) | |||
3011 | } | 2946 | } |
3012 | #endif | 2947 | #endif |
3013 | 2948 | ||
3014 | notrace unsigned long get_parent_ip(unsigned long addr) | ||
3015 | { | ||
3016 | if (in_lock_functions(addr)) { | ||
3017 | addr = CALLER_ADDR2; | ||
3018 | if (in_lock_functions(addr)) | ||
3019 | addr = CALLER_ADDR3; | ||
3020 | } | ||
3021 | return addr; | ||
3022 | } | ||
3023 | |||
3024 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 2949 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
3025 | defined(CONFIG_PREEMPT_TRACER)) | 2950 | defined(CONFIG_PREEMPT_TRACER)) |
3026 | 2951 | ||
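Editor's note: get_parent_ip() stepped one or two frames outward whenever the immediate caller sat inside the locking helpers, so the recorded preempt_disable() address pointed at real code; the hunk above drops the local copy and the call sites below switch to the shared get_lock_parent_ip() helper. A small userspace sketch of the frame-skipping idea; the address range and caller values are made-up constants standing in for CALLER_ADDRx and the lock-text section, not the kernel's in_lock_functions().

/*
 * Frame-skipping sketch: if the immediate caller lies inside the lock
 * helpers, report the next frame out instead.
 */
#include <stdio.h>

static const unsigned long lock_text_start = 0x1000;
static const unsigned long lock_text_end   = 0x2000;

static int in_lock_functions(unsigned long addr)
{
	return addr >= lock_text_start && addr < lock_text_end;
}

static unsigned long lock_parent_ip(unsigned long caller1,
				    unsigned long caller2,
				    unsigned long caller3)
{
	unsigned long addr = caller1;

	if (in_lock_functions(addr)) {
		addr = caller2;			/* skip the lock wrapper */
		if (in_lock_functions(addr))
			addr = caller3;		/* nested wrapper */
	}
	return addr;
}

int main(void)
{
	/* caller1 is inside the lock helpers, caller2 is real code */
	printf("recorded ip: %#lx\n", lock_parent_ip(0x1800, 0x4200, 0x4300));
	return 0;
}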
@@ -3042,7 +2967,7 @@ void preempt_count_add(int val) | |||
3042 | PREEMPT_MASK - 10); | 2967 | PREEMPT_MASK - 10); |
3043 | #endif | 2968 | #endif |
3044 | if (preempt_count() == val) { | 2969 | if (preempt_count() == val) { |
3045 | unsigned long ip = get_parent_ip(CALLER_ADDR1); | 2970 | unsigned long ip = get_lock_parent_ip(); |
3046 | #ifdef CONFIG_DEBUG_PREEMPT | 2971 | #ifdef CONFIG_DEBUG_PREEMPT |
3047 | current->preempt_disable_ip = ip; | 2972 | current->preempt_disable_ip = ip; |
3048 | #endif | 2973 | #endif |
@@ -3069,7 +2994,7 @@ void preempt_count_sub(int val) | |||
3069 | #endif | 2994 | #endif |
3070 | 2995 | ||
3071 | if (preempt_count() == val) | 2996 | if (preempt_count() == val) |
3072 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 2997 | trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); |
3073 | __preempt_count_sub(val); | 2998 | __preempt_count_sub(val); |
3074 | } | 2999 | } |
3075 | EXPORT_SYMBOL(preempt_count_sub); | 3000 | EXPORT_SYMBOL(preempt_count_sub); |
@@ -3281,7 +3206,6 @@ static void __sched notrace __schedule(bool preempt) | |||
3281 | 3206 | ||
3282 | trace_sched_switch(preempt, prev, next); | 3207 | trace_sched_switch(preempt, prev, next); |
3283 | rq = context_switch(rq, prev, next); /* unlocks the rq */ | 3208 | rq = context_switch(rq, prev, next); /* unlocks the rq */ |
3284 | cpu = cpu_of(rq); | ||
3285 | } else { | 3209 | } else { |
3286 | lockdep_unpin_lock(&rq->lock); | 3210 | lockdep_unpin_lock(&rq->lock); |
3287 | raw_spin_unlock_irq(&rq->lock); | 3211 | raw_spin_unlock_irq(&rq->lock); |
@@ -3467,7 +3391,7 @@ EXPORT_SYMBOL(default_wake_function); | |||
3467 | */ | 3391 | */ |
3468 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3392 | void rt_mutex_setprio(struct task_struct *p, int prio) |
3469 | { | 3393 | { |
3470 | int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; | 3394 | int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; |
3471 | struct rq *rq; | 3395 | struct rq *rq; |
3472 | const struct sched_class *prev_class; | 3396 | const struct sched_class *prev_class; |
3473 | 3397 | ||
@@ -3495,11 +3419,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3495 | 3419 | ||
3496 | trace_sched_pi_setprio(p, prio); | 3420 | trace_sched_pi_setprio(p, prio); |
3497 | oldprio = p->prio; | 3421 | oldprio = p->prio; |
3422 | |||
3423 | if (oldprio == prio) | ||
3424 | queue_flag &= ~DEQUEUE_MOVE; | ||
3425 | |||
3498 | prev_class = p->sched_class; | 3426 | prev_class = p->sched_class; |
3499 | queued = task_on_rq_queued(p); | 3427 | queued = task_on_rq_queued(p); |
3500 | running = task_current(rq, p); | 3428 | running = task_current(rq, p); |
3501 | if (queued) | 3429 | if (queued) |
3502 | dequeue_task(rq, p, DEQUEUE_SAVE); | 3430 | dequeue_task(rq, p, queue_flag); |
3503 | if (running) | 3431 | if (running) |
3504 | put_prev_task(rq, p); | 3432 | put_prev_task(rq, p); |
3505 | 3433 | ||
@@ -3517,7 +3445,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3517 | if (!dl_prio(p->normal_prio) || | 3445 | if (!dl_prio(p->normal_prio) || |
3518 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { | 3446 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { |
3519 | p->dl.dl_boosted = 1; | 3447 | p->dl.dl_boosted = 1; |
3520 | enqueue_flag |= ENQUEUE_REPLENISH; | 3448 | queue_flag |= ENQUEUE_REPLENISH; |
3521 | } else | 3449 | } else |
3522 | p->dl.dl_boosted = 0; | 3450 | p->dl.dl_boosted = 0; |
3523 | p->sched_class = &dl_sched_class; | 3451 | p->sched_class = &dl_sched_class; |
@@ -3525,7 +3453,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3525 | if (dl_prio(oldprio)) | 3453 | if (dl_prio(oldprio)) |
3526 | p->dl.dl_boosted = 0; | 3454 | p->dl.dl_boosted = 0; |
3527 | if (oldprio < prio) | 3455 | if (oldprio < prio) |
3528 | enqueue_flag |= ENQUEUE_HEAD; | 3456 | queue_flag |= ENQUEUE_HEAD; |
3529 | p->sched_class = &rt_sched_class; | 3457 | p->sched_class = &rt_sched_class; |
3530 | } else { | 3458 | } else { |
3531 | if (dl_prio(oldprio)) | 3459 | if (dl_prio(oldprio)) |
@@ -3540,7 +3468,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3540 | if (running) | 3468 | if (running) |
3541 | p->sched_class->set_curr_task(rq); | 3469 | p->sched_class->set_curr_task(rq); |
3542 | if (queued) | 3470 | if (queued) |
3543 | enqueue_task(rq, p, enqueue_flag); | 3471 | enqueue_task(rq, p, queue_flag); |
3544 | 3472 | ||
3545 | check_class_changed(rq, p, prev_class, oldprio); | 3473 | check_class_changed(rq, p, prev_class, oldprio); |
3546 | out_unlock: | 3474 | out_unlock: |
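Editor's note: the PI-boosting path now builds its dequeue/enqueue flags the same way __sched_setscheduler() does below: start from DEQUEUE_SAVE | DEQUEUE_MOVE, drop the MOVE bit when the effective priority is unchanged so the task keeps its queue position, and add ENQUEUE_HEAD when the priority becomes numerically larger, i.e. less important. A condensed sketch of that flag logic; the flag values here are illustrative, not the kernel's.

/*
 * Condensed queue_flag sketch: same-priority changes preserve the queue
 * position, de-prioritised tasks are requeued at the head.
 */
#include <stdio.h>

#define DEQUEUE_SAVE	0x02
#define DEQUEUE_MOVE	0x04
#define ENQUEUE_HEAD	0x08

static int build_queue_flags(int oldprio, int newprio)
{
	int queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;

	if (oldprio == newprio)
		queue_flag &= ~DEQUEUE_MOVE;	/* keep queue position */
	else if (oldprio < newprio)
		queue_flag |= ENQUEUE_HEAD;	/* lower importance: go to head */

	return queue_flag;
}

int main(void)
{
	printf("same prio:     %#x\n", build_queue_flags(120, 120));
	printf("prio worsens:  %#x\n", build_queue_flags(100, 120));
	printf("prio improves: %#x\n", build_queue_flags(120, 100));
	return 0;
}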
@@ -3896,6 +3824,7 @@ static int __sched_setscheduler(struct task_struct *p, | |||
3896 | const struct sched_class *prev_class; | 3824 | const struct sched_class *prev_class; |
3897 | struct rq *rq; | 3825 | struct rq *rq; |
3898 | int reset_on_fork; | 3826 | int reset_on_fork; |
3827 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; | ||
3899 | 3828 | ||
3900 | /* may grab non-irq protected spin_locks */ | 3829 | /* may grab non-irq protected spin_locks */ |
3901 | BUG_ON(in_interrupt()); | 3830 | BUG_ON(in_interrupt()); |
@@ -4078,17 +4007,14 @@ change: | |||
4078 | * itself. | 4007 | * itself. |
4079 | */ | 4008 | */ |
4080 | new_effective_prio = rt_mutex_get_effective_prio(p, newprio); | 4009 | new_effective_prio = rt_mutex_get_effective_prio(p, newprio); |
4081 | if (new_effective_prio == oldprio) { | 4010 | if (new_effective_prio == oldprio) |
4082 | __setscheduler_params(p, attr); | 4011 | queue_flags &= ~DEQUEUE_MOVE; |
4083 | task_rq_unlock(rq, p, &flags); | ||
4084 | return 0; | ||
4085 | } | ||
4086 | } | 4012 | } |
4087 | 4013 | ||
4088 | queued = task_on_rq_queued(p); | 4014 | queued = task_on_rq_queued(p); |
4089 | running = task_current(rq, p); | 4015 | running = task_current(rq, p); |
4090 | if (queued) | 4016 | if (queued) |
4091 | dequeue_task(rq, p, DEQUEUE_SAVE); | 4017 | dequeue_task(rq, p, queue_flags); |
4092 | if (running) | 4018 | if (running) |
4093 | put_prev_task(rq, p); | 4019 | put_prev_task(rq, p); |
4094 | 4020 | ||
@@ -4098,15 +4024,14 @@ change: | |||
4098 | if (running) | 4024 | if (running) |
4099 | p->sched_class->set_curr_task(rq); | 4025 | p->sched_class->set_curr_task(rq); |
4100 | if (queued) { | 4026 | if (queued) { |
4101 | int enqueue_flags = ENQUEUE_RESTORE; | ||
4102 | /* | 4027 | /* |
4103 | * We enqueue to tail when the priority of a task is | 4028 | * We enqueue to tail when the priority of a task is |
4104 | * increased (user space view). | 4029 | * increased (user space view). |
4105 | */ | 4030 | */ |
4106 | if (oldprio <= p->prio) | 4031 | if (oldprio < p->prio) |
4107 | enqueue_flags |= ENQUEUE_HEAD; | 4032 | queue_flags |= ENQUEUE_HEAD; |
4108 | 4033 | ||
4109 | enqueue_task(rq, p, enqueue_flags); | 4034 | enqueue_task(rq, p, queue_flags); |
4110 | } | 4035 | } |
4111 | 4036 | ||
4112 | check_class_changed(rq, p, prev_class, oldprio); | 4037 | check_class_changed(rq, p, prev_class, oldprio); |
@@ -5408,183 +5333,6 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5408 | } | 5333 | } |
5409 | #endif /* CONFIG_HOTPLUG_CPU */ | 5334 | #endif /* CONFIG_HOTPLUG_CPU */ |
5410 | 5335 | ||
5411 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | ||
5412 | |||
5413 | static struct ctl_table sd_ctl_dir[] = { | ||
5414 | { | ||
5415 | .procname = "sched_domain", | ||
5416 | .mode = 0555, | ||
5417 | }, | ||
5418 | {} | ||
5419 | }; | ||
5420 | |||
5421 | static struct ctl_table sd_ctl_root[] = { | ||
5422 | { | ||
5423 | .procname = "kernel", | ||
5424 | .mode = 0555, | ||
5425 | .child = sd_ctl_dir, | ||
5426 | }, | ||
5427 | {} | ||
5428 | }; | ||
5429 | |||
5430 | static struct ctl_table *sd_alloc_ctl_entry(int n) | ||
5431 | { | ||
5432 | struct ctl_table *entry = | ||
5433 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); | ||
5434 | |||
5435 | return entry; | ||
5436 | } | ||
5437 | |||
5438 | static void sd_free_ctl_entry(struct ctl_table **tablep) | ||
5439 | { | ||
5440 | struct ctl_table *entry; | ||
5441 | |||
5442 | /* | ||
5443 | * In the intermediate directories, both the child directory and | ||
5444 | * procname are dynamically allocated and could fail but the mode | ||
5445 | * will always be set. In the lowest directory the names are | ||
5446 | * static strings and all have proc handlers. | ||
5447 | */ | ||
5448 | for (entry = *tablep; entry->mode; entry++) { | ||
5449 | if (entry->child) | ||
5450 | sd_free_ctl_entry(&entry->child); | ||
5451 | if (entry->proc_handler == NULL) | ||
5452 | kfree(entry->procname); | ||
5453 | } | ||
5454 | |||
5455 | kfree(*tablep); | ||
5456 | *tablep = NULL; | ||
5457 | } | ||
5458 | |||
5459 | static int min_load_idx = 0; | ||
5460 | static int max_load_idx = CPU_LOAD_IDX_MAX-1; | ||
5461 | |||
5462 | static void | ||
5463 | set_table_entry(struct ctl_table *entry, | ||
5464 | const char *procname, void *data, int maxlen, | ||
5465 | umode_t mode, proc_handler *proc_handler, | ||
5466 | bool load_idx) | ||
5467 | { | ||
5468 | entry->procname = procname; | ||
5469 | entry->data = data; | ||
5470 | entry->maxlen = maxlen; | ||
5471 | entry->mode = mode; | ||
5472 | entry->proc_handler = proc_handler; | ||
5473 | |||
5474 | if (load_idx) { | ||
5475 | entry->extra1 = &min_load_idx; | ||
5476 | entry->extra2 = &max_load_idx; | ||
5477 | } | ||
5478 | } | ||
5479 | |||
5480 | static struct ctl_table * | ||
5481 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | ||
5482 | { | ||
5483 | struct ctl_table *table = sd_alloc_ctl_entry(14); | ||
5484 | |||
5485 | if (table == NULL) | ||
5486 | return NULL; | ||
5487 | |||
5488 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | ||
5489 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
5490 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | ||
5491 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
5492 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | ||
5493 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
5494 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | ||
5495 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
5496 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | ||
5497 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
5498 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | ||
5499 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
5500 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | ||
5501 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
5502 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | ||
5503 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
5504 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | ||
5505 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
5506 | set_table_entry(&table[9], "cache_nice_tries", | ||
5507 | &sd->cache_nice_tries, | ||
5508 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
5509 | set_table_entry(&table[10], "flags", &sd->flags, | ||
5510 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
5511 | set_table_entry(&table[11], "max_newidle_lb_cost", | ||
5512 | &sd->max_newidle_lb_cost, | ||
5513 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
5514 | set_table_entry(&table[12], "name", sd->name, | ||
5515 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); | ||
5516 | /* &table[13] is terminator */ | ||
5517 | |||
5518 | return table; | ||
5519 | } | ||
5520 | |||
5521 | static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) | ||
5522 | { | ||
5523 | struct ctl_table *entry, *table; | ||
5524 | struct sched_domain *sd; | ||
5525 | int domain_num = 0, i; | ||
5526 | char buf[32]; | ||
5527 | |||
5528 | for_each_domain(cpu, sd) | ||
5529 | domain_num++; | ||
5530 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | ||
5531 | if (table == NULL) | ||
5532 | return NULL; | ||
5533 | |||
5534 | i = 0; | ||
5535 | for_each_domain(cpu, sd) { | ||
5536 | snprintf(buf, 32, "domain%d", i); | ||
5537 | entry->procname = kstrdup(buf, GFP_KERNEL); | ||
5538 | entry->mode = 0555; | ||
5539 | entry->child = sd_alloc_ctl_domain_table(sd); | ||
5540 | entry++; | ||
5541 | i++; | ||
5542 | } | ||
5543 | return table; | ||
5544 | } | ||
5545 | |||
5546 | static struct ctl_table_header *sd_sysctl_header; | ||
5547 | static void register_sched_domain_sysctl(void) | ||
5548 | { | ||
5549 | int i, cpu_num = num_possible_cpus(); | ||
5550 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | ||
5551 | char buf[32]; | ||
5552 | |||
5553 | WARN_ON(sd_ctl_dir[0].child); | ||
5554 | sd_ctl_dir[0].child = entry; | ||
5555 | |||
5556 | if (entry == NULL) | ||
5557 | return; | ||
5558 | |||
5559 | for_each_possible_cpu(i) { | ||
5560 | snprintf(buf, 32, "cpu%d", i); | ||
5561 | entry->procname = kstrdup(buf, GFP_KERNEL); | ||
5562 | entry->mode = 0555; | ||
5563 | entry->child = sd_alloc_ctl_cpu_table(i); | ||
5564 | entry++; | ||
5565 | } | ||
5566 | |||
5567 | WARN_ON(sd_sysctl_header); | ||
5568 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | ||
5569 | } | ||
5570 | |||
5571 | /* may be called multiple times per register */ | ||
5572 | static void unregister_sched_domain_sysctl(void) | ||
5573 | { | ||
5574 | unregister_sysctl_table(sd_sysctl_header); | ||
5575 | sd_sysctl_header = NULL; | ||
5576 | if (sd_ctl_dir[0].child) | ||
5577 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | ||
5578 | } | ||
5579 | #else | ||
5580 | static void register_sched_domain_sysctl(void) | ||
5581 | { | ||
5582 | } | ||
5583 | static void unregister_sched_domain_sysctl(void) | ||
5584 | { | ||
5585 | } | ||
5586 | #endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ | ||
5587 | |||
5588 | static void set_rq_online(struct rq *rq) | 5336 | static void set_rq_online(struct rq *rq) |
5589 | { | 5337 | { |
5590 | if (!rq->online) { | 5338 | if (!rq->online) { |
@@ -6176,11 +5924,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6176 | /* Setup the mask of cpus configured for isolated domains */ | 5924 | /* Setup the mask of cpus configured for isolated domains */ |
6177 | static int __init isolated_cpu_setup(char *str) | 5925 | static int __init isolated_cpu_setup(char *str) |
6178 | { | 5926 | { |
5927 | int ret; | ||
5928 | |||
6179 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | 5929 | alloc_bootmem_cpumask_var(&cpu_isolated_map); |
6180 | cpulist_parse(str, cpu_isolated_map); | 5930 | ret = cpulist_parse(str, cpu_isolated_map); |
5931 | if (ret) { | ||
5932 | pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); | ||
5933 | return 0; | ||
5934 | } | ||
6181 | return 1; | 5935 | return 1; |
6182 | } | 5936 | } |
6183 | |||
6184 | __setup("isolcpus=", isolated_cpu_setup); | 5937 | __setup("isolcpus=", isolated_cpu_setup); |
6185 | 5938 | ||
6186 | struct s_data { | 5939 | struct s_data { |
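Editor's note: isolcpus= now reports malformed or out-of-range lists instead of silently accepting them. A toy userspace parser showing the same error-reporting shape for a comma/range CPU list; the real cpulist_parse() is far more general, and nr_cpu_ids here is just a sample limit.

/*
 * Toy "a,b-c" CPU-list parser: reject bad numbers and out-of-range CPUs
 * loudly, as the fixed isolated_cpu_setup() now does.
 */
#include <stdio.h>
#include <stdlib.h>

static const int nr_cpu_ids = 8;

static int cpulist_parse(const char *str, unsigned long *mask)
{
	const char *p = str;

	*mask = 0;
	while (*p) {
		char *end;
		long a = strtol(p, &end, 10), b;

		if (end == p)
			return -1;		/* not a number */
		b = a;
		if (*end == '-') {
			p = end + 1;
			b = strtol(p, &end, 10);
			if (end == p)
				return -1;
		}
		if (a < 0 || b < a || b >= nr_cpu_ids)
			return -1;		/* out of range */
		while (a <= b)
			*mask |= 1UL << a++;
		p = (*end == ',') ? end + 1 : end;
	}
	return 0;
}

int main(void)
{
	unsigned long isolated;

	if (cpulist_parse("1,3-5", &isolated) == 0)
		printf("isolated mask: %#lx\n", isolated);

	if (cpulist_parse("1,99", &isolated))
		fprintf(stderr, "isolcpus= values must be below %d\n",
			nr_cpu_ids);
	return 0;
}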
@@ -7863,11 +7616,9 @@ void sched_destroy_group(struct task_group *tg) | |||
7863 | void sched_offline_group(struct task_group *tg) | 7616 | void sched_offline_group(struct task_group *tg) |
7864 | { | 7617 | { |
7865 | unsigned long flags; | 7618 | unsigned long flags; |
7866 | int i; | ||
7867 | 7619 | ||
7868 | /* end participation in shares distribution */ | 7620 | /* end participation in shares distribution */ |
7869 | for_each_possible_cpu(i) | 7621 | unregister_fair_sched_group(tg); |
7870 | unregister_fair_sched_group(tg, i); | ||
7871 | 7622 | ||
7872 | spin_lock_irqsave(&task_group_lock, flags); | 7623 | spin_lock_irqsave(&task_group_lock, flags); |
7873 | list_del_rcu(&tg->list); | 7624 | list_del_rcu(&tg->list); |
@@ -7893,7 +7644,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7893 | queued = task_on_rq_queued(tsk); | 7644 | queued = task_on_rq_queued(tsk); |
7894 | 7645 | ||
7895 | if (queued) | 7646 | if (queued) |
7896 | dequeue_task(rq, tsk, DEQUEUE_SAVE); | 7647 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); |
7897 | if (unlikely(running)) | 7648 | if (unlikely(running)) |
7898 | put_prev_task(rq, tsk); | 7649 | put_prev_task(rq, tsk); |
7899 | 7650 | ||
@@ -7917,7 +7668,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7917 | if (unlikely(running)) | 7668 | if (unlikely(running)) |
7918 | tsk->sched_class->set_curr_task(rq); | 7669 | tsk->sched_class->set_curr_task(rq); |
7919 | if (queued) | 7670 | if (queued) |
7920 | enqueue_task(rq, tsk, ENQUEUE_RESTORE); | 7671 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); |
7921 | 7672 | ||
7922 | task_rq_unlock(rq, tsk, &flags); | 7673 | task_rq_unlock(rq, tsk, &flags); |
7923 | } | 7674 | } |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b2ab2ffb1adc..75f98c5498d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void) | |||
262 | #ifdef CONFIG_PARAVIRT | 262 | #ifdef CONFIG_PARAVIRT |
263 | if (static_key_false(&paravirt_steal_enabled)) { | 263 | if (static_key_false(&paravirt_steal_enabled)) {
264 | u64 steal; | 264 | u64 steal; |
265 | cputime_t steal_ct; | 265 | unsigned long steal_jiffies; |
266 | 266 | ||
267 | steal = paravirt_steal_clock(smp_processor_id()); | 267 | steal = paravirt_steal_clock(smp_processor_id()); |
268 | steal -= this_rq()->prev_steal_time; | 268 | steal -= this_rq()->prev_steal_time; |
269 | 269 | ||
270 | /* | 270 | /* |
271 | * cputime_t may be less precise than nsecs (eg: if it's | 271 | * steal is in nsecs but our caller is expecting steal |
272 | * based on jiffies). Lets cast the result to cputime | 272 | * time in jiffies. Lets cast the result to jiffies |
273 | * granularity and account the rest on the next rounds. | 273 | * granularity and account the rest on the next rounds. |
274 | */ | 274 | */ |
275 | steal_ct = nsecs_to_cputime(steal); | 275 | steal_jiffies = nsecs_to_jiffies(steal); |
276 | this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); | 276 | this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); |
277 | 277 | ||
278 | account_steal_time(steal_ct); | 278 | account_steal_time(jiffies_to_cputime(steal_jiffies)); |
279 | return steal_ct; | 279 | return steal_jiffies; |
280 | } | 280 | } |
281 | #endif | 281 | #endif |
282 | return false; | 282 | return false; |
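Editor's note: the fix above keeps steal-time accounting in the units the caller expects: the paravirt steal clock is in nanoseconds, whole jiffies are accounted, and the sub-jiffy remainder stays in prev_steal_time for the next tick. A small arithmetic sketch of that carry; HZ and the sample values are illustrative.

/*
 * Steal-time carry sketch: account whole jiffies, keep the remainder.
 */
#include <stdio.h>

#define HZ		250
#define NSEC_PER_SEC	1000000000ULL
#define NSEC_PER_JIFFY	(NSEC_PER_SEC / HZ)

static unsigned long long prev_steal_time;	/* nsec already accounted */

static unsigned long account_steal(unsigned long long steal_clock_ns)
{
	unsigned long long steal = steal_clock_ns - prev_steal_time;
	unsigned long steal_jiffies = (unsigned long)(steal / NSEC_PER_JIFFY);

	/* advance only by the nsec worth of the jiffies we accounted */
	prev_steal_time += (unsigned long long)steal_jiffies * NSEC_PER_JIFFY;
	return steal_jiffies;
}

int main(void)
{
	/* 10.5 ms of steal at HZ=250 (4 ms/jiffy): 2 jiffies, 2.5 ms kept */
	printf("accounted %lu jiffies\n", account_steal(10500000ULL));
	/* 2 ms more arrives: 4.5 ms pending -> 1 jiffy, 0.5 ms kept */
	printf("accounted %lu jiffies\n", account_steal(12500000ULL));
	return 0;
}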
@@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
668 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 668 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
669 | 669 | ||
670 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 670 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
671 | static unsigned long long vtime_delta(struct task_struct *tsk) | 671 | static cputime_t vtime_delta(struct task_struct *tsk) |
672 | { | 672 | { |
673 | unsigned long long clock; | 673 | unsigned long now = READ_ONCE(jiffies); |
674 | 674 | ||
675 | clock = local_clock(); | 675 | if (time_before(now, (unsigned long)tsk->vtime_snap)) |
676 | if (clock < tsk->vtime_snap) | ||
677 | return 0; | 676 | return 0; |
678 | 677 | ||
679 | return clock - tsk->vtime_snap; | 678 | return jiffies_to_cputime(now - tsk->vtime_snap); |
680 | } | 679 | } |
681 | 680 | ||
682 | static cputime_t get_vtime_delta(struct task_struct *tsk) | 681 | static cputime_t get_vtime_delta(struct task_struct *tsk) |
683 | { | 682 | { |
684 | unsigned long long delta = vtime_delta(tsk); | 683 | unsigned long now = READ_ONCE(jiffies); |
684 | unsigned long delta = now - tsk->vtime_snap; | ||
685 | 685 | ||
686 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 686 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); |
687 | tsk->vtime_snap += delta; | 687 | tsk->vtime_snap = now; |
688 | 688 | ||
689 | /* CHECKME: always safe to convert nsecs to cputime? */ | 689 | return jiffies_to_cputime(delta); |
690 | return nsecs_to_cputime(delta); | ||
691 | } | 690 | } |
692 | 691 | ||
693 | static void __vtime_account_system(struct task_struct *tsk) | 692 | static void __vtime_account_system(struct task_struct *tsk) |
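Editor's note: with VIRT_CPU_ACCOUNTING_GEN switched to jiffy granularity, the vtime snapshot is a jiffies value and the delta is a plain subtraction guarded by a wrap-safe comparison. A standalone sketch of that, including the wraparound case; the time_before() macro mirrors the kernel's and the numbers are arbitrary.

/*
 * Jiffy-granular vtime delta sketch with a wrap-safe guard.
 */
#include <stdio.h>

#define time_before(a, b)	((long)((a) - (b)) < 0)

static unsigned long jiffies;		/* stand-in for the global tick count */
static unsigned long vtime_snap;

static unsigned long vtime_delta(void)
{
	unsigned long now = jiffies;

	if (time_before(now, vtime_snap))
		return 0;		/* snapshot logically in the future */
	return now - vtime_snap;	/* whole jiffies of CPU time */
}

int main(void)
{
	vtime_snap = 100;
	jiffies = 103;
	printf("delta = %lu jiffies\n", vtime_delta());	/* 3 */

	/* across the wrap point the unsigned subtraction still works */
	vtime_snap = (unsigned long)-2;
	jiffies = 1;
	printf("delta = %lu jiffies\n", vtime_delta());	/* 3 */
	return 0;
}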
@@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk) | |||
699 | 698 | ||
700 | void vtime_account_system(struct task_struct *tsk) | 699 | void vtime_account_system(struct task_struct *tsk) |
701 | { | 700 | { |
701 | if (!vtime_delta(tsk)) | ||
702 | return; | ||
703 | |||
702 | write_seqcount_begin(&tsk->vtime_seqcount); | 704 | write_seqcount_begin(&tsk->vtime_seqcount); |
703 | __vtime_account_system(tsk); | 705 | __vtime_account_system(tsk); |
704 | write_seqcount_end(&tsk->vtime_seqcount); | 706 | write_seqcount_end(&tsk->vtime_seqcount); |
@@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk) | |||
707 | void vtime_gen_account_irq_exit(struct task_struct *tsk) | 709 | void vtime_gen_account_irq_exit(struct task_struct *tsk) |
708 | { | 710 | { |
709 | write_seqcount_begin(&tsk->vtime_seqcount); | 711 | write_seqcount_begin(&tsk->vtime_seqcount); |
710 | __vtime_account_system(tsk); | 712 | if (vtime_delta(tsk)) |
713 | __vtime_account_system(tsk); | ||
711 | if (context_tracking_in_user()) | 714 | if (context_tracking_in_user()) |
712 | tsk->vtime_snap_whence = VTIME_USER; | 715 | tsk->vtime_snap_whence = VTIME_USER; |
713 | write_seqcount_end(&tsk->vtime_seqcount); | 716 | write_seqcount_end(&tsk->vtime_seqcount); |
@@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk) | |||
718 | cputime_t delta_cpu; | 721 | cputime_t delta_cpu; |
719 | 722 | ||
720 | write_seqcount_begin(&tsk->vtime_seqcount); | 723 | write_seqcount_begin(&tsk->vtime_seqcount); |
721 | delta_cpu = get_vtime_delta(tsk); | ||
722 | tsk->vtime_snap_whence = VTIME_SYS; | 724 | tsk->vtime_snap_whence = VTIME_SYS; |
723 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | 725 | if (vtime_delta(tsk)) { |
726 | delta_cpu = get_vtime_delta(tsk); | ||
727 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | ||
728 | } | ||
724 | write_seqcount_end(&tsk->vtime_seqcount); | 729 | write_seqcount_end(&tsk->vtime_seqcount); |
725 | } | 730 | } |
726 | 731 | ||
727 | void vtime_user_enter(struct task_struct *tsk) | 732 | void vtime_user_enter(struct task_struct *tsk) |
728 | { | 733 | { |
729 | write_seqcount_begin(&tsk->vtime_seqcount); | 734 | write_seqcount_begin(&tsk->vtime_seqcount); |
730 | __vtime_account_system(tsk); | 735 | if (vtime_delta(tsk)) |
736 | __vtime_account_system(tsk); | ||
731 | tsk->vtime_snap_whence = VTIME_USER; | 737 | tsk->vtime_snap_whence = VTIME_USER; |
732 | write_seqcount_end(&tsk->vtime_seqcount); | 738 | write_seqcount_end(&tsk->vtime_seqcount); |
733 | } | 739 | } |
@@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk) | |||
742 | * that can thus safely catch up with a tickless delta. | 748 | * that can thus safely catch up with a tickless delta. |
743 | */ | 749 | */ |
744 | write_seqcount_begin(&tsk->vtime_seqcount); | 750 | write_seqcount_begin(&tsk->vtime_seqcount); |
745 | __vtime_account_system(tsk); | 751 | if (vtime_delta(tsk)) |
752 | __vtime_account_system(tsk); | ||
746 | current->flags |= PF_VCPU; | 753 | current->flags |= PF_VCPU; |
747 | write_seqcount_end(&tsk->vtime_seqcount); | 754 | write_seqcount_end(&tsk->vtime_seqcount); |
748 | } | 755 | } |
@@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev) | |||
772 | 779 | ||
773 | write_seqcount_begin(&current->vtime_seqcount); | 780 | write_seqcount_begin(&current->vtime_seqcount);
774 | current->vtime_snap_whence = VTIME_SYS; | 781 | current->vtime_snap_whence = VTIME_SYS; |
775 | current->vtime_snap = sched_clock_cpu(smp_processor_id()); | 782 | current->vtime_snap = jiffies; |
776 | write_seqcount_end(&current->vtime_seqcount); | 783 | write_seqcount_end(&current->vtime_seqcount);
777 | } | 784 | } |
778 | 785 | ||
@@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) | |||
783 | local_irq_save(flags); | 790 | local_irq_save(flags); |
784 | write_seqcount_begin(&t->vtime_seqcount); | 791 | write_seqcount_begin(&t->vtime_seqcount); |
785 | t->vtime_snap_whence = VTIME_SYS; | 792 | t->vtime_snap_whence = VTIME_SYS; |
786 | t->vtime_snap = sched_clock_cpu(cpu); | 793 | t->vtime_snap = jiffies; |
787 | write_seqcount_end(&t->vtime_seqcount); | 794 | write_seqcount_end(&t->vtime_seqcount); |
788 | local_irq_restore(flags); | 795 | local_irq_restore(flags); |
789 | } | 796 | } |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 57b939c81bce..c7a036facbe1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | |||
352 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | 352 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); |
353 | struct rq *rq = rq_of_dl_rq(dl_rq); | 353 | struct rq *rq = rq_of_dl_rq(dl_rq); |
354 | 354 | ||
355 | WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); | 355 | WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); |
356 | |||
357 | /* | ||
358 | * We are racing with the deadline timer. So, do nothing because | ||
359 | * the deadline timer handler will take care of properly recharging | ||
360 | * the runtime and postponing the deadline | ||
361 | */ | ||
362 | if (dl_se->dl_throttled) | ||
363 | return; | ||
356 | 364 | ||
357 | /* | 365 | /* |
358 | * We use the regular wall clock time to set deadlines in the | 366 | * We use the regular wall clock time to set deadlines in the |
@@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | |||
361 | */ | 369 | */ |
362 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 370 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; |
363 | dl_se->runtime = pi_se->dl_runtime; | 371 | dl_se->runtime = pi_se->dl_runtime; |
364 | dl_se->dl_new = 0; | ||
365 | } | 372 | } |
366 | 373 | ||
367 | /* | 374 | /* |
@@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, | |||
399 | dl_se->runtime = pi_se->dl_runtime; | 406 | dl_se->runtime = pi_se->dl_runtime; |
400 | } | 407 | } |
401 | 408 | ||
409 | if (dl_se->dl_yielded && dl_se->runtime > 0) | ||
410 | dl_se->runtime = 0; | ||
411 | |||
402 | /* | 412 | /* |
403 | * We keep moving the deadline away until we get some | 413 | * We keep moving the deadline away until we get some |
404 | * available runtime for the entity. This ensures correct | 414 | * available runtime for the entity. This ensures correct |
@@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, | |||
500 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | 510 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); |
501 | struct rq *rq = rq_of_dl_rq(dl_rq); | 511 | struct rq *rq = rq_of_dl_rq(dl_rq); |
502 | 512 | ||
503 | /* | ||
504 | * The arrival of a new instance needs special treatment, i.e., | ||
505 | * the actual scheduling parameters have to be "renewed". | ||
506 | */ | ||
507 | if (dl_se->dl_new) { | ||
508 | setup_new_dl_entity(dl_se, pi_se); | ||
509 | return; | ||
510 | } | ||
511 | |||
512 | if (dl_time_before(dl_se->deadline, rq_clock(rq)) || | 513 | if (dl_time_before(dl_se->deadline, rq_clock(rq)) || |
513 | dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { | 514 | dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { |
514 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 515 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; |
@@ -605,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
605 | } | 606 | } |
606 | 607 | ||
607 | /* | 608 | /* |
608 | * This is possible if switched_from_dl() raced against a running | ||
609 | * callback that took the above !dl_task() path and we've since then | ||
610 | * switched back into SCHED_DEADLINE. | ||
611 | * | ||
612 | * There's nothing to do except drop our task reference. | ||
613 | */ | ||
614 | if (dl_se->dl_new) | ||
615 | goto unlock; | ||
616 | |||
617 | /* | ||
618 | * The task might have been boosted by someone else and might be in the | 609 | * The task might have been boosted by someone else and might be in the |
619 | * boosting/deboosting path, its not throttled. | 610 | * boosting/deboosting path, its not throttled. |
620 | */ | 611 | */ |
@@ -735,8 +726,11 @@ static void update_curr_dl(struct rq *rq) | |||
735 | * approach need further study. | 726 | * approach need further study. |
736 | */ | 727 | */ |
737 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; | 728 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; |
738 | if (unlikely((s64)delta_exec <= 0)) | 729 | if (unlikely((s64)delta_exec <= 0)) { |
730 | if (unlikely(dl_se->dl_yielded)) | ||
731 | goto throttle; | ||
739 | return; | 732 | return; |
733 | } | ||
740 | 734 | ||
741 | schedstat_set(curr->se.statistics.exec_max, | 735 | schedstat_set(curr->se.statistics.exec_max, |
742 | max(curr->se.statistics.exec_max, delta_exec)); | 736 | max(curr->se.statistics.exec_max, delta_exec)); |
@@ -749,8 +743,10 @@ static void update_curr_dl(struct rq *rq) | |||
749 | 743 | ||
750 | sched_rt_avg_update(rq, delta_exec); | 744 | sched_rt_avg_update(rq, delta_exec); |
751 | 745 | ||
752 | dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; | 746 | dl_se->runtime -= delta_exec; |
753 | if (dl_runtime_exceeded(dl_se)) { | 747 | |
748 | throttle: | ||
749 | if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { | ||
754 | dl_se->dl_throttled = 1; | 750 | dl_se->dl_throttled = 1; |
755 | __dequeue_task_dl(rq, curr, 0); | 751 | __dequeue_task_dl(rq, curr, 0); |
756 | if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) | 752 | if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) |
@@ -917,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, | |||
917 | * parameters of the task might need updating. Otherwise, | 913 | * parameters of the task might need updating. Otherwise, |
918 | * we want a replenishment of its runtime. | 914 | * we want a replenishment of its runtime. |
919 | */ | 915 | */ |
920 | if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) | 916 | if (flags & ENQUEUE_WAKEUP) |
921 | update_dl_entity(dl_se, pi_se); | 917 | update_dl_entity(dl_se, pi_se); |
922 | else if (flags & ENQUEUE_REPLENISH) | 918 | else if (flags & ENQUEUE_REPLENISH) |
923 | replenish_dl_entity(dl_se, pi_se); | 919 | replenish_dl_entity(dl_se, pi_se); |
@@ -994,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
994 | */ | 990 | */ |
995 | static void yield_task_dl(struct rq *rq) | 991 | static void yield_task_dl(struct rq *rq) |
996 | { | 992 | { |
997 | struct task_struct *p = rq->curr; | ||
998 | |||
999 | /* | 993 | /* |
1000 | * We make the task go to sleep until its current deadline by | 994 | * We make the task go to sleep until its current deadline by |
1001 | * forcing its runtime to zero. This way, update_curr_dl() stops | 995 | * forcing its runtime to zero. This way, update_curr_dl() stops |
1002 | * it and the bandwidth timer will wake it up and will give it | 996 | * it and the bandwidth timer will wake it up and will give it |
1003 | * new scheduling parameters (thanks to dl_yielded=1). | 997 | * new scheduling parameters (thanks to dl_yielded=1). |
1004 | */ | 998 | */ |
1005 | if (p->dl.runtime > 0) { | 999 | rq->curr->dl.dl_yielded = 1; |
1006 | rq->curr->dl.dl_yielded = 1; | 1000 | |
1007 | p->dl.runtime = 0; | ||
1008 | } | ||
1009 | update_rq_clock(rq); | 1001 | update_rq_clock(rq); |
1010 | update_curr_dl(rq); | 1002 | update_curr_dl(rq); |
1011 | /* | 1003 | /* |
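Editor's note: yield_task_dl() no longer zeroes the remaining runtime itself; it only marks the entity as yielded, update_curr_dl() throttles it even when delta_exec is still zero, and replenish_dl_entity() discards any leftover runtime when the new period starts. A much-simplified sketch of that control flow; the struct is a stand-in, not the kernel's sched_dl_entity, and the rq/clock/timer machinery is elided.

/*
 * Deadline-yield sketch: yield marks, update throttles, replenish forfeits
 * the leftover budget and hands out a fresh one.
 */
#include <stdbool.h>
#include <stdio.h>

struct dl_entity {
	long long runtime;	/* remaining runtime this period, ns */
	bool dl_yielded;
	bool dl_throttled;
};

static void update_curr_dl(struct dl_entity *dl, long long delta_exec)
{
	if (delta_exec <= 0 && !dl->dl_yielded)
		return;				/* nothing to account */

	if (delta_exec > 0)
		dl->runtime -= delta_exec;

	/* throttle on overrun *or* on an explicit yield */
	if (dl->runtime <= 0 || dl->dl_yielded)
		dl->dl_throttled = true;
}

static void replenish(struct dl_entity *dl, long long budget)
{
	if (dl->dl_yielded && dl->runtime > 0)
		dl->runtime = 0;	/* forfeit the leftover budget */

	while (dl->runtime <= 0)	/* deadline postponing elided */
		dl->runtime += budget;

	dl->dl_yielded = false;
	dl->dl_throttled = false;
}

static void yield_task_dl(struct dl_entity *dl)
{
	dl->dl_yielded = true;		/* runtime is no longer zeroed here */
	update_curr_dl(dl, 0);
}

int main(void)
{
	struct dl_entity dl = { .runtime = 5000000 };

	yield_task_dl(&dl);
	printf("after yield: throttled=%d runtime=%lld\n",
	       dl.dl_throttled, dl.runtime);
	replenish(&dl, 10000000);
	printf("next period: throttled=%d runtime=%lld\n",
	       dl.dl_throttled, dl.runtime);
	return 0;
}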
@@ -1722,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
1722 | */ | 1714 | */ |
1723 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | 1715 | static void switched_to_dl(struct rq *rq, struct task_struct *p) |
1724 | { | 1716 | { |
1717 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) | ||
1718 | setup_new_dl_entity(&p->dl, &p->dl); | ||
1719 | |||
1725 | if (task_on_rq_queued(p) && rq->curr != p) { | 1720 | if (task_on_rq_queued(p) && rq->curr != p) { |
1726 | #ifdef CONFIG_SMP | 1721 | #ifdef CONFIG_SMP |
1727 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) | 1722 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) |
@@ -1768,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, | |||
1768 | */ | 1763 | */ |
1769 | resched_curr(rq); | 1764 | resched_curr(rq); |
1770 | #endif /* CONFIG_SMP */ | 1765 | #endif /* CONFIG_SMP */ |
1771 | } else | 1766 | } |
1772 | switched_to_dl(rq, p); | ||
1773 | } | 1767 | } |
1774 | 1768 | ||
1775 | const struct sched_class dl_sched_class = { | 1769 | const struct sched_class dl_sched_class = { |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..4fbc3bd5ff60 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
18 | #include <linux/mempolicy.h> | 18 | #include <linux/mempolicy.h> |
19 | #include <linux/debugfs.h> | ||
19 | 20 | ||
20 | #include "sched.h" | 21 | #include "sched.h" |
21 | 22 | ||
@@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec) | |||
58 | 59 | ||
59 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) | 60 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) |
60 | 61 | ||
62 | #define SCHED_FEAT(name, enabled) \ | ||
63 | #name , | ||
64 | |||
65 | static const char * const sched_feat_names[] = { | ||
66 | #include "features.h" | ||
67 | }; | ||
68 | |||
69 | #undef SCHED_FEAT | ||
70 | |||
71 | static int sched_feat_show(struct seq_file *m, void *v) | ||
72 | { | ||
73 | int i; | ||
74 | |||
75 | for (i = 0; i < __SCHED_FEAT_NR; i++) { | ||
76 | if (!(sysctl_sched_features & (1UL << i))) | ||
77 | seq_puts(m, "NO_"); | ||
78 | seq_printf(m, "%s ", sched_feat_names[i]); | ||
79 | } | ||
80 | seq_puts(m, "\n"); | ||
81 | |||
82 | return 0; | ||
83 | } | ||
84 | |||
85 | #ifdef HAVE_JUMP_LABEL | ||
86 | |||
87 | #define jump_label_key__true STATIC_KEY_INIT_TRUE | ||
88 | #define jump_label_key__false STATIC_KEY_INIT_FALSE | ||
89 | |||
90 | #define SCHED_FEAT(name, enabled) \ | ||
91 | jump_label_key__##enabled , | ||
92 | |||
93 | struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { | ||
94 | #include "features.h" | ||
95 | }; | ||
96 | |||
97 | #undef SCHED_FEAT | ||
98 | |||
99 | static void sched_feat_disable(int i) | ||
100 | { | ||
101 | static_key_disable(&sched_feat_keys[i]); | ||
102 | } | ||
103 | |||
104 | static void sched_feat_enable(int i) | ||
105 | { | ||
106 | static_key_enable(&sched_feat_keys[i]); | ||
107 | } | ||
108 | #else | ||
109 | static void sched_feat_disable(int i) { }; | ||
110 | static void sched_feat_enable(int i) { }; | ||
111 | #endif /* HAVE_JUMP_LABEL */ | ||
112 | |||
113 | static int sched_feat_set(char *cmp) | ||
114 | { | ||
115 | int i; | ||
116 | int neg = 0; | ||
117 | |||
118 | if (strncmp(cmp, "NO_", 3) == 0) { | ||
119 | neg = 1; | ||
120 | cmp += 3; | ||
121 | } | ||
122 | |||
123 | for (i = 0; i < __SCHED_FEAT_NR; i++) { | ||
124 | if (strcmp(cmp, sched_feat_names[i]) == 0) { | ||
125 | if (neg) { | ||
126 | sysctl_sched_features &= ~(1UL << i); | ||
127 | sched_feat_disable(i); | ||
128 | } else { | ||
129 | sysctl_sched_features |= (1UL << i); | ||
130 | sched_feat_enable(i); | ||
131 | } | ||
132 | break; | ||
133 | } | ||
134 | } | ||
135 | |||
136 | return i; | ||
137 | } | ||
138 | |||
139 | static ssize_t | ||
140 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
141 | size_t cnt, loff_t *ppos) | ||
142 | { | ||
143 | char buf[64]; | ||
144 | char *cmp; | ||
145 | int i; | ||
146 | struct inode *inode; | ||
147 | |||
148 | if (cnt > 63) | ||
149 | cnt = 63; | ||
150 | |||
151 | if (copy_from_user(&buf, ubuf, cnt)) | ||
152 | return -EFAULT; | ||
153 | |||
154 | buf[cnt] = 0; | ||
155 | cmp = strstrip(buf); | ||
156 | |||
157 | /* Ensure the static_key remains in a consistent state */ | ||
158 | inode = file_inode(filp); | ||
159 | inode_lock(inode); | ||
160 | i = sched_feat_set(cmp); | ||
161 | inode_unlock(inode); | ||
162 | if (i == __SCHED_FEAT_NR) | ||
163 | return -EINVAL; | ||
164 | |||
165 | *ppos += cnt; | ||
166 | |||
167 | return cnt; | ||
168 | } | ||
169 | |||
170 | static int sched_feat_open(struct inode *inode, struct file *filp) | ||
171 | { | ||
172 | return single_open(filp, sched_feat_show, NULL); | ||
173 | } | ||
174 | |||
175 | static const struct file_operations sched_feat_fops = { | ||
176 | .open = sched_feat_open, | ||
177 | .write = sched_feat_write, | ||
178 | .read = seq_read, | ||
179 | .llseek = seq_lseek, | ||
180 | .release = single_release, | ||
181 | }; | ||
182 | |||
183 | static __init int sched_init_debug(void) | ||
184 | { | ||
185 | debugfs_create_file("sched_features", 0644, NULL, NULL, | ||
186 | &sched_feat_fops); | ||
187 | |||
188 | return 0; | ||
189 | } | ||
190 | late_initcall(sched_init_debug); | ||
191 | |||
192 | #ifdef CONFIG_SMP | ||
193 | |||
194 | #ifdef CONFIG_SYSCTL | ||
195 | |||
196 | static struct ctl_table sd_ctl_dir[] = { | ||
197 | { | ||
198 | .procname = "sched_domain", | ||
199 | .mode = 0555, | ||
200 | }, | ||
201 | {} | ||
202 | }; | ||
203 | |||
204 | static struct ctl_table sd_ctl_root[] = { | ||
205 | { | ||
206 | .procname = "kernel", | ||
207 | .mode = 0555, | ||
208 | .child = sd_ctl_dir, | ||
209 | }, | ||
210 | {} | ||
211 | }; | ||
212 | |||
213 | static struct ctl_table *sd_alloc_ctl_entry(int n) | ||
214 | { | ||
215 | struct ctl_table *entry = | ||
216 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); | ||
217 | |||
218 | return entry; | ||
219 | } | ||
220 | |||
221 | static void sd_free_ctl_entry(struct ctl_table **tablep) | ||
222 | { | ||
223 | struct ctl_table *entry; | ||
224 | |||
225 | /* | ||
226 | * In the intermediate directories, both the child directory and | ||
227 | * procname are dynamically allocated and could fail but the mode | ||
228 | * will always be set. In the lowest directory the names are | ||
229 | * static strings and all have proc handlers. | ||
230 | */ | ||
231 | for (entry = *tablep; entry->mode; entry++) { | ||
232 | if (entry->child) | ||
233 | sd_free_ctl_entry(&entry->child); | ||
234 | if (entry->proc_handler == NULL) | ||
235 | kfree(entry->procname); | ||
236 | } | ||
237 | |||
238 | kfree(*tablep); | ||
239 | *tablep = NULL; | ||
240 | } | ||
241 | |||
242 | static int min_load_idx = 0; | ||
243 | static int max_load_idx = CPU_LOAD_IDX_MAX-1; | ||
244 | |||
245 | static void | ||
246 | set_table_entry(struct ctl_table *entry, | ||
247 | const char *procname, void *data, int maxlen, | ||
248 | umode_t mode, proc_handler *proc_handler, | ||
249 | bool load_idx) | ||
250 | { | ||
251 | entry->procname = procname; | ||
252 | entry->data = data; | ||
253 | entry->maxlen = maxlen; | ||
254 | entry->mode = mode; | ||
255 | entry->proc_handler = proc_handler; | ||
256 | |||
257 | if (load_idx) { | ||
258 | entry->extra1 = &min_load_idx; | ||
259 | entry->extra2 = &max_load_idx; | ||
260 | } | ||
261 | } | ||
262 | |||
263 | static struct ctl_table * | ||
264 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | ||
265 | { | ||
266 | struct ctl_table *table = sd_alloc_ctl_entry(14); | ||
267 | |||
268 | if (table == NULL) | ||
269 | return NULL; | ||
270 | |||
271 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | ||
272 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
273 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | ||
274 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
275 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | ||
276 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
277 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | ||
278 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
279 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | ||
280 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
281 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | ||
282 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
283 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | ||
284 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
285 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | ||
286 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
287 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | ||
288 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
289 | set_table_entry(&table[9], "cache_nice_tries", | ||
290 | &sd->cache_nice_tries, | ||
291 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
292 | set_table_entry(&table[10], "flags", &sd->flags, | ||
293 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
294 | set_table_entry(&table[11], "max_newidle_lb_cost", | ||
295 | &sd->max_newidle_lb_cost, | ||
296 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
297 | set_table_entry(&table[12], "name", sd->name, | ||
298 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); | ||
299 | /* &table[13] is terminator */ | ||
300 | |||
301 | return table; | ||
302 | } | ||
303 | |||
304 | static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) | ||
305 | { | ||
306 | struct ctl_table *entry, *table; | ||
307 | struct sched_domain *sd; | ||
308 | int domain_num = 0, i; | ||
309 | char buf[32]; | ||
310 | |||
311 | for_each_domain(cpu, sd) | ||
312 | domain_num++; | ||
313 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | ||
314 | if (table == NULL) | ||
315 | return NULL; | ||
316 | |||
317 | i = 0; | ||
318 | for_each_domain(cpu, sd) { | ||
319 | snprintf(buf, 32, "domain%d", i); | ||
320 | entry->procname = kstrdup(buf, GFP_KERNEL); | ||
321 | entry->mode = 0555; | ||
322 | entry->child = sd_alloc_ctl_domain_table(sd); | ||
323 | entry++; | ||
324 | i++; | ||
325 | } | ||
326 | return table; | ||
327 | } | ||
328 | |||
329 | static struct ctl_table_header *sd_sysctl_header; | ||
330 | void register_sched_domain_sysctl(void) | ||
331 | { | ||
332 | int i, cpu_num = num_possible_cpus(); | ||
333 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | ||
334 | char buf[32]; | ||
335 | |||
336 | WARN_ON(sd_ctl_dir[0].child); | ||
337 | sd_ctl_dir[0].child = entry; | ||
338 | |||
339 | if (entry == NULL) | ||
340 | return; | ||
341 | |||
342 | for_each_possible_cpu(i) { | ||
343 | snprintf(buf, 32, "cpu%d", i); | ||
344 | entry->procname = kstrdup(buf, GFP_KERNEL); | ||
345 | entry->mode = 0555; | ||
346 | entry->child = sd_alloc_ctl_cpu_table(i); | ||
347 | entry++; | ||
348 | } | ||
349 | |||
350 | WARN_ON(sd_sysctl_header); | ||
351 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | ||
352 | } | ||
353 | |||
354 | /* may be called multiple times per register */ | ||
355 | void unregister_sched_domain_sysctl(void) | ||
356 | { | ||
357 | unregister_sysctl_table(sd_sysctl_header); | ||
358 | sd_sysctl_header = NULL; | ||
359 | if (sd_ctl_dir[0].child) | ||
360 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | ||
361 | } | ||
362 | #endif /* CONFIG_SYSCTL */ | ||
363 | #endif /* CONFIG_SMP */ | ||
364 | |||
61 | #ifdef CONFIG_FAIR_GROUP_SCHED | 365 | #ifdef CONFIG_FAIR_GROUP_SCHED |
62 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) | 366 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) |
63 | { | 367 | { |
@@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
75 | PN(se->vruntime); | 379 | PN(se->vruntime); |
76 | PN(se->sum_exec_runtime); | 380 | PN(se->sum_exec_runtime); |
77 | #ifdef CONFIG_SCHEDSTATS | 381 | #ifdef CONFIG_SCHEDSTATS |
78 | PN(se->statistics.wait_start); | 382 | if (schedstat_enabled()) { |
79 | PN(se->statistics.sleep_start); | 383 | PN(se->statistics.wait_start); |
80 | PN(se->statistics.block_start); | 384 | PN(se->statistics.sleep_start); |
81 | PN(se->statistics.sleep_max); | 385 | PN(se->statistics.block_start); |
82 | PN(se->statistics.block_max); | 386 | PN(se->statistics.sleep_max); |
83 | PN(se->statistics.exec_max); | 387 | PN(se->statistics.block_max); |
84 | PN(se->statistics.slice_max); | 388 | PN(se->statistics.exec_max); |
85 | PN(se->statistics.wait_max); | 389 | PN(se->statistics.slice_max); |
86 | PN(se->statistics.wait_sum); | 390 | PN(se->statistics.wait_max); |
87 | P(se->statistics.wait_count); | 391 | PN(se->statistics.wait_sum); |
392 | P(se->statistics.wait_count); | ||
393 | } | ||
88 | #endif | 394 | #endif |
89 | P(se->load.weight); | 395 | P(se->load.weight); |
90 | #ifdef CONFIG_SMP | 396 | #ifdef CONFIG_SMP |
@@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
122 | (long long)(p->nvcsw + p->nivcsw), | 428 | (long long)(p->nvcsw + p->nivcsw), |
123 | p->prio); | 429 | p->prio); |
124 | #ifdef CONFIG_SCHEDSTATS | 430 | #ifdef CONFIG_SCHEDSTATS |
125 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | 431 | if (schedstat_enabled()) { |
126 | SPLIT_NS(p->se.statistics.wait_sum), | 432 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
127 | SPLIT_NS(p->se.sum_exec_runtime), | 433 | SPLIT_NS(p->se.statistics.wait_sum), |
128 | SPLIT_NS(p->se.statistics.sum_sleep_runtime)); | 434 | SPLIT_NS(p->se.sum_exec_runtime), |
435 | SPLIT_NS(p->se.statistics.sum_sleep_runtime)); | ||
436 | } | ||
129 | #else | 437 | #else |
130 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | 438 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
131 | 0LL, 0L, | 439 | 0LL, 0L, |
@@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | |||
258 | 566 | ||
259 | void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) | 567 | void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) |
260 | { | 568 | { |
569 | struct dl_bw *dl_bw; | ||
570 | |||
261 | SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); | 571 | SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); |
262 | SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); | 572 | SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); |
573 | #ifdef CONFIG_SMP | ||
574 | dl_bw = &cpu_rq(cpu)->rd->dl_bw; | ||
575 | #else | ||
576 | dl_bw = &dl_rq->dl_bw; | ||
577 | #endif | ||
578 | SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); | ||
579 | SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); | ||
263 | } | 580 | } |
264 | 581 | ||
265 | extern __read_mostly int sched_clock_running; | 582 | extern __read_mostly int sched_clock_running; |
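Editor's note: the two lines added above expose the deadline bandwidth accounting (root-domain wide on SMP, per-rq otherwise) in /proc/sched_debug. dl_bw->bw is the admission-control cap as a fixed-point runtime/period ratio and total_bw is the sum over admitted deadline tasks. The 20-bit shift and the 95% default below are assumptions about to_ratio() and the usual rt-bandwidth defaults; treat the exact scale as illustrative.

/*
 * Fixed-point bandwidth arithmetic sketch for the dl_bw fields.
 */
#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT	20	/* assumed fixed-point shift */

static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
	return (runtime_ns << BW_SHIFT) / period_ns;
}

int main(void)
{
	/* assumed cap: 950 ms of runtime per 1 s period, i.e. 95% */
	uint64_t bw = to_ratio(1000000000ULL, 950000000ULL);
	/* two admitted tasks: 10 ms / 100 ms and 30 ms / 200 ms */
	uint64_t total_bw = to_ratio(100000000ULL, 10000000ULL) +
			    to_ratio(200000000ULL, 30000000ULL);

	printf("dl_bw->bw:       %llu (%.1f%%)\n",
	       (unsigned long long)bw, 100.0 * bw / (1ULL << BW_SHIFT));
	printf("dl_bw->total_bw: %llu (%.1f%%)\n",
	       (unsigned long long)total_bw,
	       100.0 * total_bw / (1ULL << BW_SHIFT));
	return 0;
}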
@@ -313,17 +630,18 @@ do { \ | |||
313 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); | 630 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); |
314 | #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); | 631 | #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); |
315 | 632 | ||
316 | P(yld_count); | ||
317 | |||
318 | P(sched_count); | ||
319 | P(sched_goidle); | ||
320 | #ifdef CONFIG_SMP | 633 | #ifdef CONFIG_SMP |
321 | P64(avg_idle); | 634 | P64(avg_idle); |
322 | P64(max_idle_balance_cost); | 635 | P64(max_idle_balance_cost); |
323 | #endif | 636 | #endif |
324 | 637 | ||
325 | P(ttwu_count); | 638 | if (schedstat_enabled()) { |
326 | P(ttwu_local); | 639 | P(yld_count); |
640 | P(sched_count); | ||
641 | P(sched_goidle); | ||
642 | P(ttwu_count); | ||
643 | P(ttwu_local); | ||
644 | } | ||
327 | 645 | ||
328 | #undef P | 646 | #undef P |
329 | #undef P64 | 647 | #undef P64 |
@@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
569 | nr_switches = p->nvcsw + p->nivcsw; | 887 | nr_switches = p->nvcsw + p->nivcsw; |
570 | 888 | ||
571 | #ifdef CONFIG_SCHEDSTATS | 889 | #ifdef CONFIG_SCHEDSTATS |
572 | PN(se.statistics.sum_sleep_runtime); | ||
573 | PN(se.statistics.wait_start); | ||
574 | PN(se.statistics.sleep_start); | ||
575 | PN(se.statistics.block_start); | ||
576 | PN(se.statistics.sleep_max); | ||
577 | PN(se.statistics.block_max); | ||
578 | PN(se.statistics.exec_max); | ||
579 | PN(se.statistics.slice_max); | ||
580 | PN(se.statistics.wait_max); | ||
581 | PN(se.statistics.wait_sum); | ||
582 | P(se.statistics.wait_count); | ||
583 | PN(se.statistics.iowait_sum); | ||
584 | P(se.statistics.iowait_count); | ||
585 | P(se.nr_migrations); | 890 | P(se.nr_migrations); |
586 | P(se.statistics.nr_migrations_cold); | ||
587 | P(se.statistics.nr_failed_migrations_affine); | ||
588 | P(se.statistics.nr_failed_migrations_running); | ||
589 | P(se.statistics.nr_failed_migrations_hot); | ||
590 | P(se.statistics.nr_forced_migrations); | ||
591 | P(se.statistics.nr_wakeups); | ||
592 | P(se.statistics.nr_wakeups_sync); | ||
593 | P(se.statistics.nr_wakeups_migrate); | ||
594 | P(se.statistics.nr_wakeups_local); | ||
595 | P(se.statistics.nr_wakeups_remote); | ||
596 | P(se.statistics.nr_wakeups_affine); | ||
597 | P(se.statistics.nr_wakeups_affine_attempts); | ||
598 | P(se.statistics.nr_wakeups_passive); | ||
599 | P(se.statistics.nr_wakeups_idle); | ||
600 | 891 | ||
601 | { | 892 | if (schedstat_enabled()) { |
602 | u64 avg_atom, avg_per_cpu; | 893 | u64 avg_atom, avg_per_cpu; |
603 | 894 | ||
895 | PN(se.statistics.sum_sleep_runtime); | ||
896 | PN(se.statistics.wait_start); | ||
897 | PN(se.statistics.sleep_start); | ||
898 | PN(se.statistics.block_start); | ||
899 | PN(se.statistics.sleep_max); | ||
900 | PN(se.statistics.block_max); | ||
901 | PN(se.statistics.exec_max); | ||
902 | PN(se.statistics.slice_max); | ||
903 | PN(se.statistics.wait_max); | ||
904 | PN(se.statistics.wait_sum); | ||
905 | P(se.statistics.wait_count); | ||
906 | PN(se.statistics.iowait_sum); | ||
907 | P(se.statistics.iowait_count); | ||
908 | P(se.statistics.nr_migrations_cold); | ||
909 | P(se.statistics.nr_failed_migrations_affine); | ||
910 | P(se.statistics.nr_failed_migrations_running); | ||
911 | P(se.statistics.nr_failed_migrations_hot); | ||
912 | P(se.statistics.nr_forced_migrations); | ||
913 | P(se.statistics.nr_wakeups); | ||
914 | P(se.statistics.nr_wakeups_sync); | ||
915 | P(se.statistics.nr_wakeups_migrate); | ||
916 | P(se.statistics.nr_wakeups_local); | ||
917 | P(se.statistics.nr_wakeups_remote); | ||
918 | P(se.statistics.nr_wakeups_affine); | ||
919 | P(se.statistics.nr_wakeups_affine_attempts); | ||
920 | P(se.statistics.nr_wakeups_passive); | ||
921 | P(se.statistics.nr_wakeups_idle); | ||
922 | |||
604 | avg_atom = p->se.sum_exec_runtime; | 923 | avg_atom = p->se.sum_exec_runtime; |
605 | if (nr_switches) | 924 | if (nr_switches) |
606 | avg_atom = div64_ul(avg_atom, nr_switches); | 925 | avg_atom = div64_ul(avg_atom, nr_switches); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b83947..33130529e9b5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -20,8 +20,8 @@ | |||
20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra | 20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include <linux/latencytop.h> | ||
24 | #include <linux/sched.h> | 23 | #include <linux/sched.h> |
24 | #include <linux/latencytop.h> | ||
25 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
26 | #include <linux/cpuidle.h> | 26 | #include <linux/cpuidle.h> |
27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
@@ -755,7 +755,9 @@ static void | |||
755 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 755 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
756 | { | 756 | { |
757 | struct task_struct *p; | 757 | struct task_struct *p; |
758 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; | 758 | u64 delta; |
759 | |||
760 | delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; | ||
759 | 761 | ||
760 | if (entity_is_task(se)) { | 762 | if (entity_is_task(se)) { |
761 | p = task_of(se); | 763 | p = task_of(se); |
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
776 | se->statistics.wait_sum += delta; | 778 | se->statistics.wait_sum += delta; |
777 | se->statistics.wait_start = 0; | 779 | se->statistics.wait_start = 0; |
778 | } | 780 | } |
779 | #else | ||
780 | static inline void | ||
781 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
782 | { | ||
783 | } | ||
784 | |||
785 | static inline void | ||
786 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
787 | { | ||
788 | } | ||
789 | #endif | ||
790 | 781 | ||
791 | /* | 782 | /* |
792 | * Task is being enqueued - update stats: | 783 | * Task is being enqueued - update stats: |
793 | */ | 784 | */ |
794 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 785 | static inline void |
786 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
795 | { | 787 | { |
796 | /* | 788 | /* |
797 | * Are we enqueueing a waiting task? (for current tasks | 789 | * Are we enqueueing a waiting task? (for current tasks |
@@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
802 | } | 794 | } |
803 | 795 | ||
804 | static inline void | 796 | static inline void |
805 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 797 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
806 | { | 798 | { |
807 | /* | 799 | /* |
808 | * Mark the end of the wait period if dequeueing a | 800 | * Mark the end of the wait period if dequeueing a |
@@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
810 | */ | 802 | */ |
811 | if (se != cfs_rq->curr) | 803 | if (se != cfs_rq->curr) |
812 | update_stats_wait_end(cfs_rq, se); | 804 | update_stats_wait_end(cfs_rq, se); |
805 | |||
806 | if (flags & DEQUEUE_SLEEP) { | ||
807 | if (entity_is_task(se)) { | ||
808 | struct task_struct *tsk = task_of(se); | ||
809 | |||
810 | if (tsk->state & TASK_INTERRUPTIBLE) | ||
811 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); | ||
812 | if (tsk->state & TASK_UNINTERRUPTIBLE) | ||
813 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); | ||
814 | } | ||
815 | } | ||
816 | |||
817 | } | ||
818 | #else | ||
819 | static inline void | ||
820 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
821 | { | ||
813 | } | 822 | } |
814 | 823 | ||
824 | static inline void | ||
825 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
826 | { | ||
827 | } | ||
828 | |||
829 | static inline void | ||
830 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
831 | { | ||
832 | } | ||
833 | |||
834 | static inline void | ||
835 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | ||
836 | { | ||
837 | } | ||
838 | #endif | ||
839 | |||
815 | /* | 840 | /* |
816 | * We are picking a new current task - update its stats: | 841 | * We are picking a new current task - update its stats: |
817 | */ | 842 | */ |
@@ -907,10 +932,11 @@ struct numa_group { | |||
907 | spinlock_t lock; /* nr_tasks, tasks */ | 932 | spinlock_t lock; /* nr_tasks, tasks */ |
908 | int nr_tasks; | 933 | int nr_tasks; |
909 | pid_t gid; | 934 | pid_t gid; |
935 | int active_nodes; | ||
910 | 936 | ||
911 | struct rcu_head rcu; | 937 | struct rcu_head rcu; |
912 | nodemask_t active_nodes; | ||
913 | unsigned long total_faults; | 938 | unsigned long total_faults; |
939 | unsigned long max_faults_cpu; | ||
914 | /* | 940 | /* |
915 | * Faults_cpu is used to decide whether memory should move | 941 | * Faults_cpu is used to decide whether memory should move |
916 | * towards the CPU. As a consequence, these stats are weighted | 942 | * towards the CPU. As a consequence, these stats are weighted |
@@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) | |||
969 | group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; | 995 | group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; |
970 | } | 996 | } |
971 | 997 | ||
998 | /* | ||
999 | * A node triggering more than 1/3 as many NUMA faults as the maximum is | ||
1000 | * considered part of a numa group's pseudo-interleaving set. Migrations | ||
1001 | * between these nodes are slowed down, to allow things to settle down. | ||
1002 | */ | ||
1003 | #define ACTIVE_NODE_FRACTION 3 | ||
1004 | |||
1005 | static bool numa_is_active_node(int nid, struct numa_group *ng) | ||
1006 | { | ||
1007 | return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; | ||
1008 | } | ||
1009 | |||
972 | /* Handle placement on systems where not all nodes are directly connected. */ | 1010 | /* Handle placement on systems where not all nodes are directly connected. */ |
973 | static unsigned long score_nearby_nodes(struct task_struct *p, int nid, | 1011 | static unsigned long score_nearby_nodes(struct task_struct *p, int nid, |
974 | int maxdist, bool task) | 1012 | int maxdist, bool task) |
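To make the new active-node test concrete, here is a minimal user-space sketch of the same 1/3 rule (the struct is a simplified stand-in for the kernel's numa_group, and the fault counts are invented):

	#include <stdbool.h>

	#define ACTIVE_NODE_FRACTION 3

	/* Simplified stand-in for struct numa_group; illustration only. */
	struct ng_sketch {
		unsigned long max_faults_cpu;	/* highest per-node CPU fault count */
		unsigned long faults_cpu[4];	/* per-node CPU fault counts */
	};

	/* A node is "active" when it triggers more than 1/3 of the maximum faults. */
	static bool sketch_is_active_node(int nid, const struct ng_sketch *ng)
	{
		return ng->faults_cpu[nid] * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
	}

With max_faults_cpu = 900, a node with 301 faults is active (903 > 900) while one with 300 is not.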
@@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | |||
1118 | return true; | 1156 | return true; |
1119 | 1157 | ||
1120 | /* | 1158 | /* |
1121 | * Do not migrate if the destination is not a node that | 1159 | * Destination node is much more heavily used than the source |
1122 | * is actively used by this numa group. | 1160 | * node? Allow migration. |
1123 | */ | 1161 | */ |
1124 | if (!node_isset(dst_nid, ng->active_nodes)) | 1162 | if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * |
1125 | return false; | 1163 | ACTIVE_NODE_FRACTION) |
1126 | |||
1127 | /* | ||
1128 | * Source is a node that is not actively used by this | ||
1129 | * numa group, while the destination is. Migrate. | ||
1130 | */ | ||
1131 | if (!node_isset(src_nid, ng->active_nodes)) | ||
1132 | return true; | 1164 | return true; |
1133 | 1165 | ||
1134 | /* | 1166 | /* |
1135 | * Both source and destination are nodes in active | 1167 | * Distribute memory according to CPU & memory use on each node, |
1136 | * use by this numa group. Maximize memory bandwidth | 1168 | * with 3/4 hysteresis to avoid unnecessary memory migrations: |
1137 | * by migrating from more heavily used groups, to less | 1169 | * |
1138 | * heavily used ones, spreading the load around. | 1170 | * faults_cpu(dst) 3 faults_cpu(src) |
1139 | * Use a 1/4 hysteresis to avoid spurious page movement. | 1171 | * --------------- * - > --------------- |
1172 | * faults_mem(dst) 4 faults_mem(src) | ||
1140 | */ | 1173 | */ |
1141 | return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); | 1174 | return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > |
1175 | group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; | ||
1142 | } | 1176 | } |
1143 | 1177 | ||
1144 | static unsigned long weighted_cpuload(const int cpu); | 1178 | static unsigned long weighted_cpuload(const int cpu); |
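The 3/4 hysteresis above is evaluated without any division by cross-multiplying; a hedged sketch with invented counters shows the equivalence (the function and parameter names are placeholders, not the kernel's helpers):

	#include <stdbool.h>

	/*
	 * Equivalent to:  (cpu_dst / mem_dst) * 3/4  >  (cpu_src / mem_src),
	 * rewritten as a cross-multiplication so small integer counts are not
	 * rounded to zero by division.
	 */
	static bool prefer_dst(unsigned long cpu_dst, unsigned long mem_dst,
			       unsigned long cpu_src, unsigned long mem_src)
	{
		return cpu_dst * mem_src * 3 > cpu_src * mem_dst * 4;
	}

	/*
	 * Example: cpu_dst=30, mem_dst=40 (ratio 0.75) vs cpu_src=20, mem_src=40
	 * (ratio 0.5): 30*40*3 = 3600 > 20*40*4 = 3200, so the page migrates.
	 */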
@@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1484 | 1518 | ||
1485 | .best_task = NULL, | 1519 | .best_task = NULL, |
1486 | .best_imp = 0, | 1520 | .best_imp = 0, |
1487 | .best_cpu = -1 | 1521 | .best_cpu = -1, |
1488 | }; | 1522 | }; |
1489 | struct sched_domain *sd; | 1523 | struct sched_domain *sd; |
1490 | unsigned long taskweight, groupweight; | 1524 | unsigned long taskweight, groupweight; |
@@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1536 | * multiple NUMA nodes; in order to better consolidate the group, | 1570 | * multiple NUMA nodes; in order to better consolidate the group, |
1537 | * we need to check other locations. | 1571 | * we need to check other locations. |
1538 | */ | 1572 | */ |
1539 | if (env.best_cpu == -1 || (p->numa_group && | 1573 | if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { |
1540 | nodes_weight(p->numa_group->active_nodes) > 1)) { | ||
1541 | for_each_online_node(nid) { | 1574 | for_each_online_node(nid) { |
1542 | if (nid == env.src_nid || nid == p->numa_preferred_nid) | 1575 | if (nid == env.src_nid || nid == p->numa_preferred_nid) |
1543 | continue; | 1576 | continue; |
@@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p) | |||
1572 | * trying for a better one later. Do not set the preferred node here. | 1605 | * trying for a better one later. Do not set the preferred node here. |
1573 | */ | 1606 | */ |
1574 | if (p->numa_group) { | 1607 | if (p->numa_group) { |
1608 | struct numa_group *ng = p->numa_group; | ||
1609 | |||
1575 | if (env.best_cpu == -1) | 1610 | if (env.best_cpu == -1) |
1576 | nid = env.src_nid; | 1611 | nid = env.src_nid; |
1577 | else | 1612 | else |
1578 | nid = env.dst_nid; | 1613 | nid = env.dst_nid; |
1579 | 1614 | ||
1580 | if (node_isset(nid, p->numa_group->active_nodes)) | 1615 | if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) |
1581 | sched_setnuma(p, env.dst_nid); | 1616 | sched_setnuma(p, env.dst_nid); |
1582 | } | 1617 | } |
1583 | 1618 | ||
@@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
1627 | } | 1662 | } |
1628 | 1663 | ||
1629 | /* | 1664 | /* |
1630 | * Find the nodes on which the workload is actively running. We do this by | 1665 | * Find out how many nodes the workload is actively running on. Do this by |
1631 | * tracking the nodes from which NUMA hinting faults are triggered. This can | 1666 | * tracking the nodes from which NUMA hinting faults are triggered. This can |
1632 | * be different from the set of nodes where the workload's memory is currently | 1667 | * be different from the set of nodes where the workload's memory is currently |
1633 | * located. | 1668 | * located. |
1634 | * | ||
1635 | * The bitmask is used to make smarter decisions on when to do NUMA page | ||
1636 | * migrations, To prevent flip-flopping, and excessive page migrations, nodes | ||
1637 | * are added when they cause over 6/16 of the maximum number of faults, but | ||
1638 | * only removed when they drop below 3/16. | ||
1639 | */ | 1669 | */ |
1640 | static void update_numa_active_node_mask(struct numa_group *numa_group) | 1670 | static void numa_group_count_active_nodes(struct numa_group *numa_group) |
1641 | { | 1671 | { |
1642 | unsigned long faults, max_faults = 0; | 1672 | unsigned long faults, max_faults = 0; |
1643 | int nid; | 1673 | int nid, active_nodes = 0; |
1644 | 1674 | ||
1645 | for_each_online_node(nid) { | 1675 | for_each_online_node(nid) { |
1646 | faults = group_faults_cpu(numa_group, nid); | 1676 | faults = group_faults_cpu(numa_group, nid); |
@@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) | |||
1650 | 1680 | ||
1651 | for_each_online_node(nid) { | 1681 | for_each_online_node(nid) { |
1652 | faults = group_faults_cpu(numa_group, nid); | 1682 | faults = group_faults_cpu(numa_group, nid); |
1653 | if (!node_isset(nid, numa_group->active_nodes)) { | 1683 | if (faults * ACTIVE_NODE_FRACTION > max_faults) |
1654 | if (faults > max_faults * 6 / 16) | 1684 | active_nodes++; |
1655 | node_set(nid, numa_group->active_nodes); | ||
1656 | } else if (faults < max_faults * 3 / 16) | ||
1657 | node_clear(nid, numa_group->active_nodes); | ||
1658 | } | 1685 | } |
1686 | |||
1687 | numa_group->max_faults_cpu = max_faults; | ||
1688 | numa_group->active_nodes = active_nodes; | ||
1659 | } | 1689 | } |
1660 | 1690 | ||
1661 | /* | 1691 | /* |
@@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p) | |||
1946 | update_task_scan_period(p, fault_types[0], fault_types[1]); | 1976 | update_task_scan_period(p, fault_types[0], fault_types[1]); |
1947 | 1977 | ||
1948 | if (p->numa_group) { | 1978 | if (p->numa_group) { |
1949 | update_numa_active_node_mask(p->numa_group); | 1979 | numa_group_count_active_nodes(p->numa_group); |
1950 | spin_unlock_irq(group_lock); | 1980 | spin_unlock_irq(group_lock); |
1951 | max_nid = preferred_group_nid(p, max_group_nid); | 1981 | max_nid = preferred_group_nid(p, max_group_nid); |
1952 | } | 1982 | } |
@@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1990 | return; | 2020 | return; |
1991 | 2021 | ||
1992 | atomic_set(&grp->refcount, 1); | 2022 | atomic_set(&grp->refcount, 1); |
2023 | grp->active_nodes = 1; | ||
2024 | grp->max_faults_cpu = 0; | ||
1993 | spin_lock_init(&grp->lock); | 2025 | spin_lock_init(&grp->lock); |
1994 | grp->gid = p->pid; | 2026 | grp->gid = p->pid; |
1995 | /* Second half of the array tracks nids where faults happen */ | 2027 | /* Second half of the array tracks nids where faults happen */ |
1996 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * | 2028 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * |
1997 | nr_node_ids; | 2029 | nr_node_ids; |
1998 | 2030 | ||
1999 | node_set(task_node(current), grp->active_nodes); | ||
2000 | |||
2001 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) | 2031 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
2002 | grp->faults[i] = p->numa_faults[i]; | 2032 | grp->faults[i] = p->numa_faults[i]; |
2003 | 2033 | ||
@@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
2111 | bool migrated = flags & TNF_MIGRATED; | 2141 | bool migrated = flags & TNF_MIGRATED; |
2112 | int cpu_node = task_node(current); | 2142 | int cpu_node = task_node(current); |
2113 | int local = !!(flags & TNF_FAULT_LOCAL); | 2143 | int local = !!(flags & TNF_FAULT_LOCAL); |
2144 | struct numa_group *ng; | ||
2114 | int priv; | 2145 | int priv; |
2115 | 2146 | ||
2116 | if (!static_branch_likely(&sched_numa_balancing)) | 2147 | if (!static_branch_likely(&sched_numa_balancing)) |
@@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
2151 | * actively using should be counted as local. This allows the | 2182 | * actively using should be counted as local. This allows the |
2152 | * scan rate to slow down when a workload has settled down. | 2183 | * scan rate to slow down when a workload has settled down. |
2153 | */ | 2184 | */ |
2154 | if (!priv && !local && p->numa_group && | 2185 | ng = p->numa_group; |
2155 | node_isset(cpu_node, p->numa_group->active_nodes) && | 2186 | if (!priv && !local && ng && ng->active_nodes > 1 && |
2156 | node_isset(mem_node, p->numa_group->active_nodes)) | 2187 | numa_is_active_node(cpu_node, ng) && |
2188 | numa_is_active_node(mem_node, ng)) | ||
2157 | local = 1; | 2189 | local = 1; |
2158 | 2190 | ||
2159 | task_numa_placement(p); | 2191 | task_numa_placement(p); |
@@ -3102,6 +3134,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
3102 | 3134 | ||
3103 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); | 3135 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); |
3104 | 3136 | ||
3137 | static inline void check_schedstat_required(void) | ||
3138 | { | ||
3139 | #ifdef CONFIG_SCHEDSTATS | ||
3140 | if (schedstat_enabled()) | ||
3141 | return; | ||
3142 | |||
3143 | /* Force schedstat enabled if a dependent tracepoint is active */ | ||
3144 | if (trace_sched_stat_wait_enabled() || | ||
3145 | trace_sched_stat_sleep_enabled() || | ||
3146 | trace_sched_stat_iowait_enabled() || | ||
3147 | trace_sched_stat_blocked_enabled() || | ||
3148 | trace_sched_stat_runtime_enabled()) { | ||
3149 | pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " | ||
3150 | "stat_blocked and stat_runtime require the " | ||
3151 | "kernel parameter schedstats=enabled or " | ||
3152 | "kernel.sched_schedstats=1\n"); | ||
3153 | } | ||
3154 | #endif | ||
3155 | } | ||
3156 | |||
3105 | static void | 3157 | static void |
3106 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 3158 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
3107 | { | 3159 | { |
@@ -3122,11 +3174,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3122 | 3174 | ||
3123 | if (flags & ENQUEUE_WAKEUP) { | 3175 | if (flags & ENQUEUE_WAKEUP) { |
3124 | place_entity(cfs_rq, se, 0); | 3176 | place_entity(cfs_rq, se, 0); |
3125 | enqueue_sleeper(cfs_rq, se); | 3177 | if (schedstat_enabled()) |
3178 | enqueue_sleeper(cfs_rq, se); | ||
3126 | } | 3179 | } |
3127 | 3180 | ||
3128 | update_stats_enqueue(cfs_rq, se); | 3181 | check_schedstat_required(); |
3129 | check_spread(cfs_rq, se); | 3182 | if (schedstat_enabled()) { |
3183 | update_stats_enqueue(cfs_rq, se); | ||
3184 | check_spread(cfs_rq, se); | ||
3185 | } | ||
3130 | if (se != cfs_rq->curr) | 3186 | if (se != cfs_rq->curr) |
3131 | __enqueue_entity(cfs_rq, se); | 3187 | __enqueue_entity(cfs_rq, se); |
3132 | se->on_rq = 1; | 3188 | se->on_rq = 1; |
@@ -3193,19 +3249,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3193 | update_curr(cfs_rq); | 3249 | update_curr(cfs_rq); |
3194 | dequeue_entity_load_avg(cfs_rq, se); | 3250 | dequeue_entity_load_avg(cfs_rq, se); |
3195 | 3251 | ||
3196 | update_stats_dequeue(cfs_rq, se); | 3252 | if (schedstat_enabled()) |
3197 | if (flags & DEQUEUE_SLEEP) { | 3253 | update_stats_dequeue(cfs_rq, se, flags); |
3198 | #ifdef CONFIG_SCHEDSTATS | ||
3199 | if (entity_is_task(se)) { | ||
3200 | struct task_struct *tsk = task_of(se); | ||
3201 | |||
3202 | if (tsk->state & TASK_INTERRUPTIBLE) | ||
3203 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); | ||
3204 | if (tsk->state & TASK_UNINTERRUPTIBLE) | ||
3205 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); | ||
3206 | } | ||
3207 | #endif | ||
3208 | } | ||
3209 | 3254 | ||
3210 | clear_buddies(cfs_rq, se); | 3255 | clear_buddies(cfs_rq, se); |
3211 | 3256 | ||
@@ -3279,7 +3324,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3279 | * a CPU. So account for the time it spent waiting on the | 3324 | * a CPU. So account for the time it spent waiting on the |
3280 | * runqueue. | 3325 | * runqueue. |
3281 | */ | 3326 | */ |
3282 | update_stats_wait_end(cfs_rq, se); | 3327 | if (schedstat_enabled()) |
3328 | update_stats_wait_end(cfs_rq, se); | ||
3283 | __dequeue_entity(cfs_rq, se); | 3329 | __dequeue_entity(cfs_rq, se); |
3284 | update_load_avg(se, 1); | 3330 | update_load_avg(se, 1); |
3285 | } | 3331 | } |
@@ -3292,7 +3338,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3292 | * least twice that of our own weight (i.e. dont track it | 3338 | * least twice that of our own weight (i.e. dont track it |
3293 | * when there are only lesser-weight tasks around): | 3339 | * when there are only lesser-weight tasks around): |
3294 | */ | 3340 | */ |
3295 | if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 3341 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
3296 | se->statistics.slice_max = max(se->statistics.slice_max, | 3342 | se->statistics.slice_max = max(se->statistics.slice_max, |
3297 | se->sum_exec_runtime - se->prev_sum_exec_runtime); | 3343 | se->sum_exec_runtime - se->prev_sum_exec_runtime); |
3298 | } | 3344 | } |
@@ -3375,9 +3421,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
3375 | /* throttle cfs_rqs exceeding runtime */ | 3421 | /* throttle cfs_rqs exceeding runtime */ |
3376 | check_cfs_rq_runtime(cfs_rq); | 3422 | check_cfs_rq_runtime(cfs_rq); |
3377 | 3423 | ||
3378 | check_spread(cfs_rq, prev); | 3424 | if (schedstat_enabled()) { |
3425 | check_spread(cfs_rq, prev); | ||
3426 | if (prev->on_rq) | ||
3427 | update_stats_wait_start(cfs_rq, prev); | ||
3428 | } | ||
3429 | |||
3379 | if (prev->on_rq) { | 3430 | if (prev->on_rq) { |
3380 | update_stats_wait_start(cfs_rq, prev); | ||
3381 | /* Put 'current' back into the tree. */ | 3431 | /* Put 'current' back into the tree. */ |
3382 | __enqueue_entity(cfs_rq, prev); | 3432 | __enqueue_entity(cfs_rq, prev); |
3383 | /* in !on_rq case, update occurred at dequeue */ | 3433 | /* in !on_rq case, update occurred at dequeue */ |
@@ -4459,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | |||
4459 | 4509 | ||
4460 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 4510 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
4461 | 4511 | ||
4462 | old_load = this_rq->cpu_load[i] - tickless_load; | 4512 | old_load = this_rq->cpu_load[i]; |
4463 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | 4513 | old_load = decay_load_missed(old_load, pending_updates - 1, i); |
4464 | old_load += tickless_load; | 4514 | if (tickless_load) { |
4515 | old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); | ||
4516 | /* | ||
4517 | * old_load can never be a negative value because a | ||
4518 | * decayed tickless_load cannot be greater than the | ||
4519 | * original tickless_load. | ||
4520 | */ | ||
4521 | old_load += tickless_load; | ||
4522 | } | ||
4465 | new_load = this_load; | 4523 | new_load = this_load; |
4466 | /* | 4524 | /* |
4467 | * Round up the averaging division if load is increasing. This | 4525 | * Round up the averaging division if load is increasing. This |
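Why the rewritten form cannot go negative is easier to see with a toy decay function; the sketch below halves the load per missed tick instead of using the kernel's decay_load_missed() tables, and rearranges the terms so the non-negative adjustment is explicit:

	/* Toy stand-in for decay_load_missed(): halve once per missed tick. */
	static unsigned long toy_decay(unsigned long load, int missed)
	{
		return load >> missed;
	}

	static unsigned long toy_old_load(unsigned long cpu_load,
					  unsigned long tickless_load, int missed)
	{
		unsigned long old_load = toy_decay(cpu_load, missed);

		/*
		 * toy_decay(x) <= x, so the adjustment below is >= 0; the old
		 * code decayed (cpu_load - tickless_load), which underflows
		 * whenever tickless_load > cpu_load.
		 */
		if (tickless_load)
			old_load += tickless_load - toy_decay(tickless_load, missed);

		return old_load;
	}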
@@ -4484,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu) | |||
4484 | } | 4542 | } |
4485 | 4543 | ||
4486 | #ifdef CONFIG_NO_HZ_COMMON | 4544 | #ifdef CONFIG_NO_HZ_COMMON |
4545 | static void __update_cpu_load_nohz(struct rq *this_rq, | ||
4546 | unsigned long curr_jiffies, | ||
4547 | unsigned long load, | ||
4548 | int active) | ||
4549 | { | ||
4550 | unsigned long pending_updates; | ||
4551 | |||
4552 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
4553 | if (pending_updates) { | ||
4554 | this_rq->last_load_update_tick = curr_jiffies; | ||
4555 | /* | ||
4556 | * In the regular NOHZ case, we were idle, this means load 0. | ||
4557 | * In the NOHZ_FULL case, we were non-idle, we should consider | ||
4558 | * its weighted load. | ||
4559 | */ | ||
4560 | __update_cpu_load(this_rq, load, pending_updates, active); | ||
4561 | } | ||
4562 | } | ||
4563 | |||
4487 | /* | 4564 | /* |
4488 | * There is no sane way to deal with nohz on smp when using jiffies because the | 4565 | * There is no sane way to deal with nohz on smp when using jiffies because the |
4489 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | 4566 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading |
@@ -4501,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu) | |||
4501 | * Called from nohz_idle_balance() to update the load ratings before doing the | 4578 | * Called from nohz_idle_balance() to update the load ratings before doing the |
4502 | * idle balance. | 4579 | * idle balance. |
4503 | */ | 4580 | */ |
4504 | static void update_idle_cpu_load(struct rq *this_rq) | 4581 | static void update_cpu_load_idle(struct rq *this_rq) |
4505 | { | 4582 | { |
4506 | unsigned long curr_jiffies = READ_ONCE(jiffies); | ||
4507 | unsigned long load = weighted_cpuload(cpu_of(this_rq)); | ||
4508 | unsigned long pending_updates; | ||
4509 | |||
4510 | /* | 4583 | /* |
4511 | * bail if there's load or we're actually up-to-date. | 4584 | * bail if there's load or we're actually up-to-date. |
4512 | */ | 4585 | */ |
4513 | if (load || curr_jiffies == this_rq->last_load_update_tick) | 4586 | if (weighted_cpuload(cpu_of(this_rq))) |
4514 | return; | 4587 | return; |
4515 | 4588 | ||
4516 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | 4589 | __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); |
4517 | this_rq->last_load_update_tick = curr_jiffies; | ||
4518 | |||
4519 | __update_cpu_load(this_rq, load, pending_updates, 0); | ||
4520 | } | 4590 | } |
4521 | 4591 | ||
4522 | /* | 4592 | /* |
@@ -4527,22 +4597,12 @@ void update_cpu_load_nohz(int active) | |||
4527 | struct rq *this_rq = this_rq(); | 4597 | struct rq *this_rq = this_rq(); |
4528 | unsigned long curr_jiffies = READ_ONCE(jiffies); | 4598 | unsigned long curr_jiffies = READ_ONCE(jiffies); |
4529 | unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; | 4599 | unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; |
4530 | unsigned long pending_updates; | ||
4531 | 4600 | ||
4532 | if (curr_jiffies == this_rq->last_load_update_tick) | 4601 | if (curr_jiffies == this_rq->last_load_update_tick) |
4533 | return; | 4602 | return; |
4534 | 4603 | ||
4535 | raw_spin_lock(&this_rq->lock); | 4604 | raw_spin_lock(&this_rq->lock); |
4536 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | 4605 | __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); |
4537 | if (pending_updates) { | ||
4538 | this_rq->last_load_update_tick = curr_jiffies; | ||
4539 | /* | ||
4540 | * In the regular NOHZ case, we were idle, this means load 0. | ||
4541 | * In the NOHZ_FULL case, we were non-idle, we should consider | ||
4542 | * its weighted load. | ||
4543 | */ | ||
4544 | __update_cpu_load(this_rq, load, pending_updates, active); | ||
4545 | } | ||
4546 | raw_spin_unlock(&this_rq->lock); | 4606 | raw_spin_unlock(&this_rq->lock); |
4547 | } | 4607 | } |
4548 | #endif /* CONFIG_NO_HZ */ | 4608 | #endif /* CONFIG_NO_HZ */ |
@@ -4554,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq) | |||
4554 | { | 4614 | { |
4555 | unsigned long load = weighted_cpuload(cpu_of(this_rq)); | 4615 | unsigned long load = weighted_cpuload(cpu_of(this_rq)); |
4556 | /* | 4616 | /* |
4557 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | 4617 | * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). |
4558 | */ | 4618 | */ |
4559 | this_rq->last_load_update_tick = jiffies; | 4619 | this_rq->last_load_update_tick = jiffies; |
4560 | __update_cpu_load(this_rq, load, 1, 1); | 4620 | __update_cpu_load(this_rq, load, 1, 1); |
@@ -7848,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
7848 | if (time_after_eq(jiffies, rq->next_balance)) { | 7908 | if (time_after_eq(jiffies, rq->next_balance)) { |
7849 | raw_spin_lock_irq(&rq->lock); | 7909 | raw_spin_lock_irq(&rq->lock); |
7850 | update_rq_clock(rq); | 7910 | update_rq_clock(rq); |
7851 | update_idle_cpu_load(rq); | 7911 | update_cpu_load_idle(rq); |
7852 | raw_spin_unlock_irq(&rq->lock); | 7912 | raw_spin_unlock_irq(&rq->lock); |
7853 | rebalance_domains(rq, CPU_IDLE); | 7913 | rebalance_domains(rq, CPU_IDLE); |
7854 | } | 7914 | } |
@@ -8234,11 +8294,8 @@ void free_fair_sched_group(struct task_group *tg) | |||
8234 | for_each_possible_cpu(i) { | 8294 | for_each_possible_cpu(i) { |
8235 | if (tg->cfs_rq) | 8295 | if (tg->cfs_rq) |
8236 | kfree(tg->cfs_rq[i]); | 8296 | kfree(tg->cfs_rq[i]); |
8237 | if (tg->se) { | 8297 | if (tg->se) |
8238 | if (tg->se[i]) | ||
8239 | remove_entity_load_avg(tg->se[i]); | ||
8240 | kfree(tg->se[i]); | 8298 | kfree(tg->se[i]); |
8241 | } | ||
8242 | } | 8299 | } |
8243 | 8300 | ||
8244 | kfree(tg->cfs_rq); | 8301 | kfree(tg->cfs_rq); |
@@ -8286,21 +8343,29 @@ err: | |||
8286 | return 0; | 8343 | return 0; |
8287 | } | 8344 | } |
8288 | 8345 | ||
8289 | void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8346 | void unregister_fair_sched_group(struct task_group *tg) |
8290 | { | 8347 | { |
8291 | struct rq *rq = cpu_rq(cpu); | ||
8292 | unsigned long flags; | 8348 | unsigned long flags; |
8349 | struct rq *rq; | ||
8350 | int cpu; | ||
8293 | 8351 | ||
8294 | /* | 8352 | for_each_possible_cpu(cpu) { |
8295 | * Only empty task groups can be destroyed; so we can speculatively | 8353 | if (tg->se[cpu]) |
8296 | * check on_list without danger of it being re-added. | 8354 | remove_entity_load_avg(tg->se[cpu]); |
8297 | */ | ||
8298 | if (!tg->cfs_rq[cpu]->on_list) | ||
8299 | return; | ||
8300 | 8355 | ||
8301 | raw_spin_lock_irqsave(&rq->lock, flags); | 8356 | /* |
8302 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | 8357 | * Only empty task groups can be destroyed; so we can speculatively |
8303 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 8358 | * check on_list without danger of it being re-added. |
8359 | */ | ||
8360 | if (!tg->cfs_rq[cpu]->on_list) | ||
8361 | continue; | ||
8362 | |||
8363 | rq = cpu_rq(cpu); | ||
8364 | |||
8365 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8366 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8367 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8368 | } | ||
8304 | } | 8369 | } |
8305 | 8370 | ||
8306 | void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 8371 | void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
@@ -8382,7 +8447,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8382 | return 1; | 8447 | return 1; |
8383 | } | 8448 | } |
8384 | 8449 | ||
8385 | void unregister_fair_sched_group(struct task_group *tg, int cpu) { } | 8450 | void unregister_fair_sched_group(struct task_group *tg) { } |
8386 | 8451 | ||
8387 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8452 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8388 | 8453 | ||
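Since unregister_fair_sched_group() now walks the CPUs itself (and also takes over the remove_entity_load_avg() calls from the free path), a teardown caller needs only one call per group. A hypothetical caller sketch; the real call sites live in kernel/sched/core.c, outside this diff:

	static void teardown_group_sketch(struct task_group *tg)
	{
		unregister_fair_sched_group(tg);	/* detach leaf cfs_rqs on every CPU */
		free_fair_sched_group(tg);		/* then free the per-CPU se/cfs_rq */
	}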
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..a774b4dbf291 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
58 | raw_spin_lock(&rt_b->rt_runtime_lock); | 58 | raw_spin_lock(&rt_b->rt_runtime_lock); |
59 | if (!rt_b->rt_period_active) { | 59 | if (!rt_b->rt_period_active) { |
60 | rt_b->rt_period_active = 1; | 60 | rt_b->rt_period_active = 1; |
61 | hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); | 61 | /* |
62 | * SCHED_DEADLINE updates the bandwidth, as a run away | ||
63 | * RT task with a DL task could hog a CPU. But DL does | ||
64 | * not reset the period. If a deadline task was running | ||
65 | * without an RT task running, it can cause RT tasks to | ||
66 | * throttle when they start up. Kick the timer right away | ||
67 | * to update the period. | ||
68 | */ | ||
69 | hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); | ||
62 | hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); | 70 | hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); |
63 | } | 71 | } |
64 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 72 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
@@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); | |||
436 | 444 | ||
437 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) | 445 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) |
438 | { | 446 | { |
439 | return !list_empty(&rt_se->run_list); | 447 | return rt_se->on_rq; |
440 | } | 448 | } |
441 | 449 | ||
442 | #ifdef CONFIG_RT_GROUP_SCHED | 450 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | |||
482 | return rt_se->my_q; | 490 | return rt_se->my_q; |
483 | } | 491 | } |
484 | 492 | ||
485 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); | 493 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); |
486 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | 494 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); |
487 | 495 | ||
488 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 496 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
489 | { | 497 | { |
@@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
499 | if (!rt_se) | 507 | if (!rt_se) |
500 | enqueue_top_rt_rq(rt_rq); | 508 | enqueue_top_rt_rq(rt_rq); |
501 | else if (!on_rt_rq(rt_se)) | 509 | else if (!on_rt_rq(rt_se)) |
502 | enqueue_rt_entity(rt_se, false); | 510 | enqueue_rt_entity(rt_se, 0); |
503 | 511 | ||
504 | if (rt_rq->highest_prio.curr < curr->prio) | 512 | if (rt_rq->highest_prio.curr < curr->prio) |
505 | resched_curr(rq); | 513 | resched_curr(rq); |
@@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | |||
516 | if (!rt_se) | 524 | if (!rt_se) |
517 | dequeue_top_rt_rq(rt_rq); | 525 | dequeue_top_rt_rq(rt_rq); |
518 | else if (on_rt_rq(rt_se)) | 526 | else if (on_rt_rq(rt_se)) |
519 | dequeue_rt_entity(rt_se); | 527 | dequeue_rt_entity(rt_se, 0); |
520 | } | 528 | } |
521 | 529 | ||
522 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | 530 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) |
@@ -1166,7 +1174,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
1166 | dec_rt_group(rt_se, rt_rq); | 1174 | dec_rt_group(rt_se, rt_rq); |
1167 | } | 1175 | } |
1168 | 1176 | ||
1169 | static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | 1177 | /* |
1178 | * Change rt_se->run_list location unless SAVE && !MOVE | ||
1179 | * | ||
1180 | * assumes ENQUEUE/DEQUEUE flags match | ||
1181 | */ | ||
1182 | static inline bool move_entity(unsigned int flags) | ||
1183 | { | ||
1184 | if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) | ||
1185 | return false; | ||
1186 | |||
1187 | return true; | ||
1188 | } | ||
1189 | |||
1190 | static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) | ||
1191 | { | ||
1192 | list_del_init(&rt_se->run_list); | ||
1193 | |||
1194 | if (list_empty(array->queue + rt_se_prio(rt_se))) | ||
1195 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | ||
1196 | |||
1197 | rt_se->on_list = 0; | ||
1198 | } | ||
1199 | |||
1200 | static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) | ||
1170 | { | 1201 | { |
1171 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 1202 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
1172 | struct rt_prio_array *array = &rt_rq->active; | 1203 | struct rt_prio_array *array = &rt_rq->active; |
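The move_entity() helper is what lets a DEQUEUE_SAVE-only cycle (as used by the PI / sched_setscheduler() fix in this series) keep an RT entity's place on its priority list. A small self-contained check of the three flag combinations, with the flag values copied from the kernel/sched/sched.h hunk later in this diff:

	#include <assert.h>
	#include <stdbool.h>

	#define DEQUEUE_SAVE 0x02
	#define DEQUEUE_MOVE 0x04

	static bool move_entity(unsigned int flags)
	{
		/* SAVE without MOVE is the only combination that keeps the spot */
		return (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE;
	}

	int main(void)
	{
		assert(move_entity(0));                           /* plain dequeue */
		assert(!move_entity(DEQUEUE_SAVE));               /* spurious dequeue */
		assert(move_entity(DEQUEUE_SAVE | DEQUEUE_MOVE)); /* explicit requeue */
		return 0;
	}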
@@ -1179,26 +1210,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | |||
1179 | * get throttled and the current group doesn't have any other | 1210 | * get throttled and the current group doesn't have any other |
1180 | * active members. | 1211 | * active members. |
1181 | */ | 1212 | */ |
1182 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 1213 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { |
1214 | if (rt_se->on_list) | ||
1215 | __delist_rt_entity(rt_se, array); | ||
1183 | return; | 1216 | return; |
1217 | } | ||
1184 | 1218 | ||
1185 | if (head) | 1219 | if (move_entity(flags)) { |
1186 | list_add(&rt_se->run_list, queue); | 1220 | WARN_ON_ONCE(rt_se->on_list); |
1187 | else | 1221 | if (flags & ENQUEUE_HEAD) |
1188 | list_add_tail(&rt_se->run_list, queue); | 1222 | list_add(&rt_se->run_list, queue); |
1189 | __set_bit(rt_se_prio(rt_se), array->bitmap); | 1223 | else |
1224 | list_add_tail(&rt_se->run_list, queue); | ||
1225 | |||
1226 | __set_bit(rt_se_prio(rt_se), array->bitmap); | ||
1227 | rt_se->on_list = 1; | ||
1228 | } | ||
1229 | rt_se->on_rq = 1; | ||
1190 | 1230 | ||
1191 | inc_rt_tasks(rt_se, rt_rq); | 1231 | inc_rt_tasks(rt_se, rt_rq); |
1192 | } | 1232 | } |
1193 | 1233 | ||
1194 | static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) | 1234 | static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) |
1195 | { | 1235 | { |
1196 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 1236 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
1197 | struct rt_prio_array *array = &rt_rq->active; | 1237 | struct rt_prio_array *array = &rt_rq->active; |
1198 | 1238 | ||
1199 | list_del_init(&rt_se->run_list); | 1239 | if (move_entity(flags)) { |
1200 | if (list_empty(array->queue + rt_se_prio(rt_se))) | 1240 | WARN_ON_ONCE(!rt_se->on_list); |
1201 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | 1241 | __delist_rt_entity(rt_se, array); |
1242 | } | ||
1243 | rt_se->on_rq = 0; | ||
1202 | 1244 | ||
1203 | dec_rt_tasks(rt_se, rt_rq); | 1245 | dec_rt_tasks(rt_se, rt_rq); |
1204 | } | 1246 | } |
@@ -1207,7 +1249,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
1207 | * Because the prio of an upper entry depends on the lower | 1249 | * Because the prio of an upper entry depends on the lower |
1208 | * entries, we must remove entries top - down. | 1250 | * entries, we must remove entries top - down. |
1209 | */ | 1251 | */ |
1210 | static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | 1252 | static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) |
1211 | { | 1253 | { |
1212 | struct sched_rt_entity *back = NULL; | 1254 | struct sched_rt_entity *back = NULL; |
1213 | 1255 | ||
@@ -1220,31 +1262,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | |||
1220 | 1262 | ||
1221 | for (rt_se = back; rt_se; rt_se = rt_se->back) { | 1263 | for (rt_se = back; rt_se; rt_se = rt_se->back) { |
1222 | if (on_rt_rq(rt_se)) | 1264 | if (on_rt_rq(rt_se)) |
1223 | __dequeue_rt_entity(rt_se); | 1265 | __dequeue_rt_entity(rt_se, flags); |
1224 | } | 1266 | } |
1225 | } | 1267 | } |
1226 | 1268 | ||
1227 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | 1269 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) |
1228 | { | 1270 | { |
1229 | struct rq *rq = rq_of_rt_se(rt_se); | 1271 | struct rq *rq = rq_of_rt_se(rt_se); |
1230 | 1272 | ||
1231 | dequeue_rt_stack(rt_se); | 1273 | dequeue_rt_stack(rt_se, flags); |
1232 | for_each_sched_rt_entity(rt_se) | 1274 | for_each_sched_rt_entity(rt_se) |
1233 | __enqueue_rt_entity(rt_se, head); | 1275 | __enqueue_rt_entity(rt_se, flags); |
1234 | enqueue_top_rt_rq(&rq->rt); | 1276 | enqueue_top_rt_rq(&rq->rt); |
1235 | } | 1277 | } |
1236 | 1278 | ||
1237 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | 1279 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) |
1238 | { | 1280 | { |
1239 | struct rq *rq = rq_of_rt_se(rt_se); | 1281 | struct rq *rq = rq_of_rt_se(rt_se); |
1240 | 1282 | ||
1241 | dequeue_rt_stack(rt_se); | 1283 | dequeue_rt_stack(rt_se, flags); |
1242 | 1284 | ||
1243 | for_each_sched_rt_entity(rt_se) { | 1285 | for_each_sched_rt_entity(rt_se) { |
1244 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | 1286 | struct rt_rq *rt_rq = group_rt_rq(rt_se); |
1245 | 1287 | ||
1246 | if (rt_rq && rt_rq->rt_nr_running) | 1288 | if (rt_rq && rt_rq->rt_nr_running) |
1247 | __enqueue_rt_entity(rt_se, false); | 1289 | __enqueue_rt_entity(rt_se, flags); |
1248 | } | 1290 | } |
1249 | enqueue_top_rt_rq(&rq->rt); | 1291 | enqueue_top_rt_rq(&rq->rt); |
1250 | } | 1292 | } |
@@ -1260,7 +1302,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
1260 | if (flags & ENQUEUE_WAKEUP) | 1302 | if (flags & ENQUEUE_WAKEUP) |
1261 | rt_se->timeout = 0; | 1303 | rt_se->timeout = 0; |
1262 | 1304 | ||
1263 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); | 1305 | enqueue_rt_entity(rt_se, flags); |
1264 | 1306 | ||
1265 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) | 1307 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
1266 | enqueue_pushable_task(rq, p); | 1308 | enqueue_pushable_task(rq, p); |
@@ -1271,7 +1313,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
1271 | struct sched_rt_entity *rt_se = &p->rt; | 1313 | struct sched_rt_entity *rt_se = &p->rt; |
1272 | 1314 | ||
1273 | update_curr_rt(rq); | 1315 | update_curr_rt(rq); |
1274 | dequeue_rt_entity(rt_se); | 1316 | dequeue_rt_entity(rt_se, flags); |
1275 | 1317 | ||
1276 | dequeue_pushable_task(rq, p); | 1318 | dequeue_pushable_task(rq, p); |
1277 | } | 1319 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..ef5875fff5b7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -3,6 +3,7 @@ | |||
3 | #include <linux/sched/sysctl.h> | 3 | #include <linux/sched/sysctl.h> |
4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
5 | #include <linux/sched/deadline.h> | 5 | #include <linux/sched/deadline.h> |
6 | #include <linux/binfmts.h> | ||
6 | #include <linux/mutex.h> | 7 | #include <linux/mutex.h> |
7 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
8 | #include <linux/stop_machine.h> | 9 | #include <linux/stop_machine.h> |
@@ -313,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data); | |||
313 | 314 | ||
314 | extern void free_fair_sched_group(struct task_group *tg); | 315 | extern void free_fair_sched_group(struct task_group *tg); |
315 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); | 316 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); |
316 | extern void unregister_fair_sched_group(struct task_group *tg, int cpu); | 317 | extern void unregister_fair_sched_group(struct task_group *tg); |
317 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 318 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
318 | struct sched_entity *se, int cpu, | 319 | struct sched_entity *se, int cpu, |
319 | struct sched_entity *parent); | 320 | struct sched_entity *parent); |
320 | extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | 321 | extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); |
321 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | ||
322 | 322 | ||
323 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); | 323 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); |
324 | extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | 324 | extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); |
@@ -909,6 +909,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group) | |||
909 | 909 | ||
910 | extern int group_balance_cpu(struct sched_group *sg); | 910 | extern int group_balance_cpu(struct sched_group *sg); |
911 | 911 | ||
912 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | ||
913 | void register_sched_domain_sysctl(void); | ||
914 | void unregister_sched_domain_sysctl(void); | ||
915 | #else | ||
916 | static inline void register_sched_domain_sysctl(void) | ||
917 | { | ||
918 | } | ||
919 | static inline void unregister_sched_domain_sysctl(void) | ||
920 | { | ||
921 | } | ||
922 | #endif | ||
923 | |||
912 | #else | 924 | #else |
913 | 925 | ||
914 | static inline void sched_ttwu_pending(void) { } | 926 | static inline void sched_ttwu_pending(void) { } |
@@ -1022,6 +1034,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | |||
1022 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 1034 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
1023 | 1035 | ||
1024 | extern struct static_key_false sched_numa_balancing; | 1036 | extern struct static_key_false sched_numa_balancing; |
1037 | extern struct static_key_false sched_schedstats; | ||
1025 | 1038 | ||
1026 | static inline u64 global_rt_period(void) | 1039 | static inline u64 global_rt_period(void) |
1027 | { | 1040 | { |
@@ -1130,18 +1143,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
1130 | extern const int sched_prio_to_weight[40]; | 1143 | extern const int sched_prio_to_weight[40]; |
1131 | extern const u32 sched_prio_to_wmult[40]; | 1144 | extern const u32 sched_prio_to_wmult[40]; |
1132 | 1145 | ||
1146 | /* | ||
1147 | * {de,en}queue flags: | ||
1148 | * | ||
1149 | * DEQUEUE_SLEEP - task is no longer runnable | ||
1150 | * ENQUEUE_WAKEUP - task just became runnable | ||
1151 | * | ||
1152 | * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks | ||
1153 | * are in a known state which allows modification. Such pairs | ||
1154 | * should preserve as much state as possible. | ||
1155 | * | ||
1156 | * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location | ||
1157 | * in the runqueue. | ||
1158 | * | ||
1159 | * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) | ||
1160 | * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) | ||
1161 | * ENQUEUE_WAKING - sched_class::task_waking was called | ||
1162 | * | ||
1163 | */ | ||
1164 | |||
1165 | #define DEQUEUE_SLEEP 0x01 | ||
1166 | #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ | ||
1167 | #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ | ||
1168 | |||
1133 | #define ENQUEUE_WAKEUP 0x01 | 1169 | #define ENQUEUE_WAKEUP 0x01 |
1134 | #define ENQUEUE_HEAD 0x02 | 1170 | #define ENQUEUE_RESTORE 0x02 |
1171 | #define ENQUEUE_MOVE 0x04 | ||
1172 | |||
1173 | #define ENQUEUE_HEAD 0x08 | ||
1174 | #define ENQUEUE_REPLENISH 0x10 | ||
1135 | #ifdef CONFIG_SMP | 1175 | #ifdef CONFIG_SMP |
1136 | #define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ | 1176 | #define ENQUEUE_WAKING 0x20 |
1137 | #else | 1177 | #else |
1138 | #define ENQUEUE_WAKING 0x00 | 1178 | #define ENQUEUE_WAKING 0x00 |
1139 | #endif | 1179 | #endif |
1140 | #define ENQUEUE_REPLENISH 0x08 | ||
1141 | #define ENQUEUE_RESTORE 0x10 | ||
1142 | |||
1143 | #define DEQUEUE_SLEEP 0x01 | ||
1144 | #define DEQUEUE_SAVE 0x02 | ||
1145 | 1180 | ||
1146 | #define RETRY_TASK ((void *)-1UL) | 1181 | #define RETRY_TASK ((void *)-1UL) |
1147 | 1182 | ||
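The new SAVE/RESTORE flags are meant to be used in matched pairs around an attribute change. A hedged sketch of that caller pattern, modelled on the attribute-change paths in kernel/sched/core.c (dequeue_task(), enqueue_task() and task_on_rq_queued() are the core scheduler's helpers):

	static void change_task_attr(struct rq *rq, struct task_struct *p,
				     void (*change)(struct task_struct *p))
	{
		int queued = task_on_rq_queued(p);

		if (queued)
			dequeue_task(rq, p, DEQUEUE_SAVE);	/* keep list position */

		change(p);	/* e.g. adjust priority or move to another group */

		if (queued)
			enqueue_task(rq, p, ENQUEUE_RESTORE);	/* matching restore */
	}

Adding DEQUEUE_MOVE/ENQUEUE_MOVE to the pair would additionally requeue the task at the tail instead of preserving its spot.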
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc7632de5..70b3b6a20fb0 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
29 | if (rq) | 29 | if (rq) |
30 | rq->rq_sched_info.run_delay += delta; | 30 | rq->rq_sched_info.run_delay += delta; |
31 | } | 31 | } |
32 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 32 | # define schedstat_enabled() static_branch_unlikely(&sched_schedstats) |
33 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 33 | # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) |
34 | # define schedstat_set(var, val) do { var = (val); } while (0) | 34 | # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) |
35 | # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | ||
35 | #else /* !CONFIG_SCHEDSTATS */ | 36 | #else /* !CONFIG_SCHEDSTATS */ |
36 | static inline void | 37 | static inline void |
37 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 38 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) |
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
42 | static inline void | 43 | static inline void |
43 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 44 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) |
44 | {} | 45 | {} |
46 | # define schedstat_enabled() 0 | ||
45 | # define schedstat_inc(rq, field) do { } while (0) | 47 | # define schedstat_inc(rq, field) do { } while (0) |
46 | # define schedstat_add(rq, field, amt) do { } while (0) | 48 | # define schedstat_add(rq, field, amt) do { } while (0) |
47 | # define schedstat_set(var, val) do { } while (0) | 49 | # define schedstat_set(var, val) do { } while (0) |
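With the static key folded into these macros, a stats update in a hot path reduces to a patched-out branch when schedstats are off. A sketch of a caller (ttwu_count and ttwu_local are existing schedstat fields of struct rq; the wrapper function itself is invented):

	static inline void account_ttwu(struct rq *rq, bool local)
	{
		schedstat_inc(rq, ttwu_count);		/* no-op unless the key is on */
		if (local)
			schedstat_inc(rq, ttwu_local);
	}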
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c new file mode 100644 index 000000000000..82f0dff90030 --- /dev/null +++ b/kernel/sched/swait.c | |||
@@ -0,0 +1,123 @@ | |||
1 | #include <linux/sched.h> | ||
2 | #include <linux/swait.h> | ||
3 | |||
4 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, | ||
5 | struct lock_class_key *key) | ||
6 | { | ||
7 | raw_spin_lock_init(&q->lock); | ||
8 | lockdep_set_class_and_name(&q->lock, key, name); | ||
9 | INIT_LIST_HEAD(&q->task_list); | ||
10 | } | ||
11 | EXPORT_SYMBOL(__init_swait_queue_head); | ||
12 | |||
13 | /* | ||
14 | * The thing about the wake_up_state() return value; I think we can ignore it. | ||
15 | * | ||
16 | * If for some reason it would return 0, that means the previously waiting | ||
17 | * task is already running, so it will observe condition true (or has already). | ||
18 | */ | ||
19 | void swake_up_locked(struct swait_queue_head *q) | ||
20 | { | ||
21 | struct swait_queue *curr; | ||
22 | |||
23 | if (list_empty(&q->task_list)) | ||
24 | return; | ||
25 | |||
26 | curr = list_first_entry(&q->task_list, typeof(*curr), task_list); | ||
27 | wake_up_process(curr->task); | ||
28 | list_del_init(&curr->task_list); | ||
29 | } | ||
30 | EXPORT_SYMBOL(swake_up_locked); | ||
31 | |||
32 | void swake_up(struct swait_queue_head *q) | ||
33 | { | ||
34 | unsigned long flags; | ||
35 | |||
36 | if (!swait_active(q)) | ||
37 | return; | ||
38 | |||
39 | raw_spin_lock_irqsave(&q->lock, flags); | ||
40 | swake_up_locked(q); | ||
41 | raw_spin_unlock_irqrestore(&q->lock, flags); | ||
42 | } | ||
43 | EXPORT_SYMBOL(swake_up); | ||
44 | |||
45 | /* | ||
46 | * Does not allow usage from IRQ disabled, since we must be able to | ||
47 | * release IRQs to guarantee bounded hold time. | ||
48 | */ | ||
49 | void swake_up_all(struct swait_queue_head *q) | ||
50 | { | ||
51 | struct swait_queue *curr; | ||
52 | LIST_HEAD(tmp); | ||
53 | |||
54 | if (!swait_active(q)) | ||
55 | return; | ||
56 | |||
57 | raw_spin_lock_irq(&q->lock); | ||
58 | list_splice_init(&q->task_list, &tmp); | ||
59 | while (!list_empty(&tmp)) { | ||
60 | curr = list_first_entry(&tmp, typeof(*curr), task_list); | ||
61 | |||
62 | wake_up_state(curr->task, TASK_NORMAL); | ||
63 | list_del_init(&curr->task_list); | ||
64 | |||
65 | if (list_empty(&tmp)) | ||
66 | break; | ||
67 | |||
68 | raw_spin_unlock_irq(&q->lock); | ||
69 | raw_spin_lock_irq(&q->lock); | ||
70 | } | ||
71 | raw_spin_unlock_irq(&q->lock); | ||
72 | } | ||
73 | EXPORT_SYMBOL(swake_up_all); | ||
74 | |||
75 | void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) | ||
76 | { | ||
77 | wait->task = current; | ||
78 | if (list_empty(&wait->task_list)) | ||
79 | list_add(&wait->task_list, &q->task_list); | ||
80 | } | ||
81 | |||
82 | void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) | ||
83 | { | ||
84 | unsigned long flags; | ||
85 | |||
86 | raw_spin_lock_irqsave(&q->lock, flags); | ||
87 | __prepare_to_swait(q, wait); | ||
88 | set_current_state(state); | ||
89 | raw_spin_unlock_irqrestore(&q->lock, flags); | ||
90 | } | ||
91 | EXPORT_SYMBOL(prepare_to_swait); | ||
92 | |||
93 | long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) | ||
94 | { | ||
95 | if (signal_pending_state(state, current)) | ||
96 | return -ERESTARTSYS; | ||
97 | |||
98 | prepare_to_swait(q, wait, state); | ||
99 | |||
100 | return 0; | ||
101 | } | ||
102 | EXPORT_SYMBOL(prepare_to_swait_event); | ||
103 | |||
104 | void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) | ||
105 | { | ||
106 | __set_current_state(TASK_RUNNING); | ||
107 | if (!list_empty(&wait->task_list)) | ||
108 | list_del_init(&wait->task_list); | ||
109 | } | ||
110 | |||
111 | void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) | ||
112 | { | ||
113 | unsigned long flags; | ||
114 | |||
115 | __set_current_state(TASK_RUNNING); | ||
116 | |||
117 | if (!list_empty_careful(&wait->task_list)) { | ||
118 | raw_spin_lock_irqsave(&q->lock, flags); | ||
119 | list_del_init(&wait->task_list); | ||
120 | raw_spin_unlock_irqrestore(&q->lock, flags); | ||
121 | } | ||
122 | } | ||
123 | EXPORT_SYMBOL(finish_swait); | ||
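Putting the new API together, a waiter/waker pair looks like the pattern the KVM hunks below switch to. This is a hedged sketch with placeholder names: my_wq would be set up with init_swait_queue_head(), and any ordering guarantees beyond the queue lock are the caller's job, as the KVM code's explicit smp_mb() shows.

	static struct swait_queue_head my_wq;	/* init_swait_queue_head(&my_wq) first */
	static int my_cond;

	static void waiter(void)
	{
		DECLARE_SWAITQUEUE(wait);

		for (;;) {
			prepare_to_swait(&my_wq, &wait, TASK_INTERRUPTIBLE);
			if (READ_ONCE(my_cond))
				break;
			schedule();
		}
		finish_swait(&my_wq, &wait);
	}

	static void waker(void)
	{
		WRITE_ONCE(my_cond, 1);
		swake_up(&my_wq);		/* wakes at most one waiter */
	}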
diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e4436f787..8aae49dd7da8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) | |||
116 | 116 | ||
117 | if (preempt_count() == cnt) { | 117 | if (preempt_count() == cnt) { |
118 | #ifdef CONFIG_DEBUG_PREEMPT | 118 | #ifdef CONFIG_DEBUG_PREEMPT |
119 | current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); | 119 | current->preempt_disable_ip = get_lock_parent_ip(); |
120 | #endif | 120 | #endif |
121 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 121 | trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); |
122 | } | 122 | } |
123 | } | 123 | } |
124 | EXPORT_SYMBOL(__local_bh_disable_ip); | 124 | EXPORT_SYMBOL(__local_bh_disable_ip); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97715fd9e790..f5102fabef7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = { | |||
350 | .mode = 0644, | 350 | .mode = 0644, |
351 | .proc_handler = proc_dointvec, | 351 | .proc_handler = proc_dointvec, |
352 | }, | 352 | }, |
353 | #ifdef CONFIG_SCHEDSTATS | ||
354 | { | ||
355 | .procname = "sched_schedstats", | ||
356 | .data = NULL, | ||
357 | .maxlen = sizeof(unsigned int), | ||
358 | .mode = 0644, | ||
359 | .proc_handler = sysctl_schedstats, | ||
360 | .extra1 = &zero, | ||
361 | .extra2 = &one, | ||
362 | }, | ||
363 | #endif /* CONFIG_SCHEDSTATS */ | ||
353 | #endif /* CONFIG_SMP */ | 364 | #endif /* CONFIG_SMP */ |
354 | #ifdef CONFIG_NUMA_BALANCING | 365 | #ifdef CONFIG_NUMA_BALANCING |
355 | { | 366 | { |
@@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = { | |||
505 | .data = &latencytop_enabled, | 516 | .data = &latencytop_enabled, |
506 | .maxlen = sizeof(int), | 517 | .maxlen = sizeof(int), |
507 | .mode = 0644, | 518 | .mode = 0644, |
508 | .proc_handler = proc_dointvec, | 519 | .proc_handler = sysctl_latencytop, |
509 | }, | 520 | }, |
510 | #endif | 521 | #endif |
511 | #ifdef CONFIG_BLK_DEV_INITRD | 522 | #ifdef CONFIG_BLK_DEV_INITRD |
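The sched_schedstats handler itself is defined in kernel/sched/core.c and is not part of these hunks; a hypothetical handler of the same shape would parse the 0/1 value with proc_dointvec_minmax() and flip the static key, roughly:

	static int schedstats_sysctl_sketch(struct ctl_table *table, int write,
					    void __user *buffer, size_t *lenp,
					    loff_t *ppos)
	{
		struct ctl_table t = *table;
		int state = static_branch_likely(&sched_schedstats);
		int err;

		t.data = &state;
		err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		if (err < 0 || !write)
			return err;

		if (state)
			static_branch_enable(&sched_schedstats);
		else
			static_branch_disable(&sched_schedstats);

		return 0;
	}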
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49e32bf..f8e26ab963ed 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
93 | { | 93 | { |
94 | struct mm_struct *mm; | 94 | struct mm_struct *mm; |
95 | 95 | ||
96 | /* convert pages-usec to Mbyte-usec */ | 96 | /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ |
97 | stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; | 97 | stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; |
98 | stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; | 98 | do_div(stats->coremem, 1000 * KB); |
99 | stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; | ||
100 | do_div(stats->virtmem, 1000 * KB); | ||
99 | mm = get_task_mm(p); | 101 | mm = get_task_mm(p); |
100 | if (mm) { | 102 | if (mm) { |
101 | /* adjust to KB unit */ | 103 | /* adjust to KB unit */ |
@@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
123 | static void __acct_update_integrals(struct task_struct *tsk, | 125 | static void __acct_update_integrals(struct task_struct *tsk, |
124 | cputime_t utime, cputime_t stime) | 126 | cputime_t utime, cputime_t stime) |
125 | { | 127 | { |
126 | if (likely(tsk->mm)) { | 128 | cputime_t time, dtime; |
127 | cputime_t time, dtime; | 129 | u64 delta; |
128 | struct timeval value; | 130 | |
129 | unsigned long flags; | 131 | if (!likely(tsk->mm)) |
130 | u64 delta; | 132 | return; |
131 | 133 | ||
132 | local_irq_save(flags); | 134 | time = stime + utime; |
133 | time = stime + utime; | 135 | dtime = time - tsk->acct_timexpd; |
134 | dtime = time - tsk->acct_timexpd; | 136 | /* Avoid division: cputime_t is often in nanoseconds already. */ |
135 | jiffies_to_timeval(cputime_to_jiffies(dtime), &value); | 137 | delta = cputime_to_nsecs(dtime); |
136 | delta = value.tv_sec; | 138 | |
137 | delta = delta * USEC_PER_SEC + value.tv_usec; | 139 | if (delta < TICK_NSEC) |
138 | 140 | return; | |
139 | if (delta == 0) | 141 | |
140 | goto out; | 142 | tsk->acct_timexpd = time; |
141 | tsk->acct_timexpd = time; | 143 | /* |
142 | tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); | 144 | * Divide by 1024 to avoid overflow, and to avoid division. |
143 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; | 145 | * The final unit reported to userspace is Mbyte-usecs, |
144 | out: | 146 | * the rest of the math is done in xacct_add_tsk. |
145 | local_irq_restore(flags); | 147 | */ |
146 | } | 148 | tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; |
149 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; | ||
147 | } | 150 | } |
148 | 151 | ||
149 | /** | 152 | /** |
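The rewritten helper drops the jiffies_to_timeval() round trip, returns early until at least one tick's worth of CPU time (TICK_NSEC) has accumulated, and swaps the division for a right shift. Per the comment in the hunk, the >> 10 is also about overflow headroom; rough, illustrative numbers (not from the commit):

	rss              ~ 2^20 pages (about 4 GB with 4 KB pages)
	unshifted growth ~ 10^9 nsec/sec * 2^20 pages ~ 10^15 per second of CPU time
	time to overflow ~ 2^64 / 10^15 ~ 1.8 * 10^4 sec, i.e. about 5 hours of CPU time
	with the >> 10   ~ 1024 times that, on the order of 7 months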
@@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk, | |||
153 | void acct_update_integrals(struct task_struct *tsk) | 156 | void acct_update_integrals(struct task_struct *tsk) |
154 | { | 157 | { |
155 | cputime_t utime, stime; | 158 | cputime_t utime, stime; |
159 | unsigned long flags; | ||
156 | 160 | ||
161 | local_irq_save(flags); | ||
157 | task_cputime(tsk, &utime, &stime); | 162 | task_cputime(tsk, &utime, &stime); |
158 | __acct_update_integrals(tsk, utime, stime); | 163 | __acct_update_integrals(tsk, utime, stime); |
164 | local_irq_restore(flags); | ||
159 | } | 165 | } |
160 | 166 | ||
161 | /** | 167 | /** |
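With local_irq_save()/local_irq_restore() hoisted out of __acct_update_integrals() and into this caller, paths that already run with interrupts disabled can call the inner helper without paying for a redundant save/restore. A minimal sketch of that split, with hypothetical names:

static void __do_account(struct task_struct *tsk)
{
	/* touches per-task accounting fields; caller has interrupts disabled */
}

void do_account(struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);		/* irq-safe wrapper for callers with irqs enabled */
	__do_account(tsk);
	local_irq_restore(flags);
}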
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index db2dd3335c6a..65da997b430a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c | |||
@@ -97,8 +97,8 @@ static void async_pf_execute(struct work_struct *work) | |||
97 | * This memory barrier pairs with prepare_to_wait's set_current_state() | 97 | * This memory barrier pairs with prepare_to_wait's set_current_state() |
98 | */ | 98 | */ |
99 | smp_mb(); | 99 | smp_mb(); |
100 | if (waitqueue_active(&vcpu->wq)) | 100 | if (swait_active(&vcpu->wq)) |
101 | wake_up_interruptible(&vcpu->wq); | 101 | swake_up(&vcpu->wq); |
102 | 102 | ||
103 | mmput(mm); | 103 | mmput(mm); |
104 | kvm_put_kvm(vcpu->kvm); | 104 | kvm_put_kvm(vcpu->kvm); |
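The conversion keeps the async page-fault wakeup shape intact: publish the result, issue a full barrier that pairs with the sleeper's set_current_state() (done inside prepare_to_swait()), then wake only if a waiter is actually queued. A minimal waker-side sketch using the swait API from this merge (names are hypothetical):

static void example_signal_done(struct swait_queue_head *wq, bool *done)
{
	WRITE_ONCE(*done, true);	/* publish the condition first */
	smp_mb();			/* pairs with set_current_state() in prepare_to_swait() */
	if (swait_active(wq))		/* skip the wakeup when nobody is waiting */
		swake_up(wq);
}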
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9102ae172d2a..5af50c3ddd53 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -216,8 +216,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | |||
216 | vcpu->kvm = kvm; | 216 | vcpu->kvm = kvm; |
217 | vcpu->vcpu_id = id; | 217 | vcpu->vcpu_id = id; |
218 | vcpu->pid = NULL; | 218 | vcpu->pid = NULL; |
219 | vcpu->halt_poll_ns = 0; | 219 | init_swait_queue_head(&vcpu->wq); |
220 | init_waitqueue_head(&vcpu->wq); | ||
221 | kvm_async_pf_vcpu_init(vcpu); | 220 | kvm_async_pf_vcpu_init(vcpu); |
222 | 221 | ||
223 | vcpu->pre_pcpu = -1; | 222 | vcpu->pre_pcpu = -1; |
@@ -1993,7 +1992,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) | |||
1993 | void kvm_vcpu_block(struct kvm_vcpu *vcpu) | 1992 | void kvm_vcpu_block(struct kvm_vcpu *vcpu) |
1994 | { | 1993 | { |
1995 | ktime_t start, cur; | 1994 | ktime_t start, cur; |
1996 | DEFINE_WAIT(wait); | 1995 | DECLARE_SWAITQUEUE(wait); |
1997 | bool waited = false; | 1996 | bool waited = false; |
1998 | u64 block_ns; | 1997 | u64 block_ns; |
1999 | 1998 | ||
@@ -2018,7 +2017,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) | |||
2018 | kvm_arch_vcpu_blocking(vcpu); | 2017 | kvm_arch_vcpu_blocking(vcpu); |
2019 | 2018 | ||
2020 | for (;;) { | 2019 | for (;;) { |
2021 | prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); | 2020 | prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); |
2022 | 2021 | ||
2023 | if (kvm_vcpu_check_block(vcpu) < 0) | 2022 | if (kvm_vcpu_check_block(vcpu) < 0) |
2024 | break; | 2023 | break; |
@@ -2027,7 +2026,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) | |||
2027 | schedule(); | 2026 | schedule(); |
2028 | } | 2027 | } |
2029 | 2028 | ||
2030 | finish_wait(&vcpu->wq, &wait); | 2029 | finish_swait(&vcpu->wq, &wait); |
2031 | cur = ktime_get(); | 2030 | cur = ktime_get(); |
2032 | 2031 | ||
2033 | kvm_arch_vcpu_unblocking(vcpu); | 2032 | kvm_arch_vcpu_unblocking(vcpu); |
@@ -2059,11 +2058,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | |||
2059 | { | 2058 | { |
2060 | int me; | 2059 | int me; |
2061 | int cpu = vcpu->cpu; | 2060 | int cpu = vcpu->cpu; |
2062 | wait_queue_head_t *wqp; | 2061 | struct swait_queue_head *wqp; |
2063 | 2062 | ||
2064 | wqp = kvm_arch_vcpu_wq(vcpu); | 2063 | wqp = kvm_arch_vcpu_wq(vcpu); |
2065 | if (waitqueue_active(wqp)) { | 2064 | if (swait_active(wqp)) { |
2066 | wake_up_interruptible(wqp); | 2065 | swake_up(wqp); |
2067 | ++vcpu->stat.halt_wakeup; | 2066 | ++vcpu->stat.halt_wakeup; |
2068 | } | 2067 | } |
2069 | 2068 | ||
@@ -2164,7 +2163,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) | |||
2164 | continue; | 2163 | continue; |
2165 | if (vcpu == me) | 2164 | if (vcpu == me) |
2166 | continue; | 2165 | continue; |
2167 | if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) | 2166 | if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) |
2168 | continue; | 2167 | continue; |
2169 | if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) | 2168 | if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) |
2170 | continue; | 2169 | continue; |
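For completeness, the sleeper side of the same API mirrors the kvm_vcpu_block() loop above: the queue head is initialized once with init_swait_queue_head(), the waiter lives on the stack via DECLARE_SWAITQUEUE(), and the condition is re-checked between prepare_to_swait() and schedule() so a wakeup cannot be lost. A minimal sketch (hypothetical names, signal handling omitted):

static struct swait_queue_head example_wq;	/* init_swait_queue_head(&example_wq) at setup time */
static bool example_done;

static void example_wait_for_done(void)
{
	DECLARE_SWAITQUEUE(wait);

	for (;;) {
		prepare_to_swait(&example_wq, &wait, TASK_INTERRUPTIBLE);
		if (READ_ONCE(example_done))	/* re-check after queueing, before sleeping */
			break;
		schedule();
	}
	finish_swait(&example_wq, &wait);	/* dequeue and restore TASK_RUNNING */
}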