author     Linus Torvalds <torvalds@linux-foundation.org>  2016-03-14 22:14:06 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-03-14 22:14:06 -0400
commit     d4e796152a049f6a675f8b6dcf7080a9d80014e5
tree       3d7bff1541b4035f7fd06c7259032e616ea6b497
parent     d88bfe1d68735595d57bd071294f664c4f054435
parent     f9c904b7613b8b4c85b10cd6b33ad41b2843fa9d
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle are:

   - Make schedstats a runtime tunable (disabled by default) and
     optimize it via static keys.

     As most distributions enable CONFIG_SCHEDSTATS=y due to its
     instrumentation value, this is a nice performance enhancement.
     (Mel Gorman)

   - Implement 'simple waitqueues' (swait): these are just pure
     waitqueues without any of the more complex features of full-blown
     waitqueues (callbacks, wake flags, wake keys, etc.). Simple
     waitqueues have less memory overhead and are faster.

     Use simple waitqueues in the RCU code (in 4 different places) and
     for handling KVM vCPU wakeups.

     (Peter Zijlstra, Daniel Wagner, Thomas Gleixner,
      Paul Gortmaker, Marcelo Tosatti)

   - sched/numa enhancements (Rik van Riel)

   - NOHZ performance enhancements (Rik van Riel)

   - Various sched/deadline enhancements (Steven Rostedt)

   - Various fixes (Peter Zijlstra)

   - ... and a number of other fixes, cleanups and smaller enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (29 commits)
  sched/cputime: Fix steal_account_process_tick() to always return jiffies
  sched/deadline: Remove dl_new from struct sched_dl_entity
  Revert "kbuild: Add option to turn incompatible pointer check into error"
  sched/deadline: Remove superfluous call to switched_to_dl()
  sched/debug: Fix preempt_disable_ip recording for preempt_disable()
  sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity
  time, acct: Drop irq save & restore from __acct_update_integrals()
  acct, time: Change indentation in __acct_update_integrals()
  sched, time: Remove non-power-of-two divides from __acct_update_integrals()
  sched/rt: Kick RT bandwidth timer immediately on start up
  sched/debug: Add deadline scheduler bandwidth ratio to /proc/sched_debug
  sched/debug: Move sched_domain_sysctl to debug.c
  sched/debug: Move the /sys/kernel/debug/sched_features file setup into debug.c
  sched/rt: Fix PI handling vs. sched_setscheduler()
  sched/core: Remove duplicated sched_group_set_shares() prototype
  sched/fair: Consolidate nohz CPU load update code
  sched/fair: Avoid using decay_load_missed() with a negative value
  sched/deadline: Always calculate end of period on sched_yield()
  sched/cgroup: Fix cgroup entity load tracking tear-down
  rcu: Use simple wait queues where possible in rcutree
  ...
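For quick orientation before the diffstat and the diff itself, here is a minimal sketch of how the new simple-waitqueue API (added below in include/linux/swait.h) is meant to be used; it mirrors the wait_event_interruptible()/wake_up_interruptible() pattern this merge converts in the KVM and RCU code. This is an illustrative sketch, not code from the merge, and the my_* names are hypothetical.

/*
 * Illustrative sketch (not part of this merge): one waiter and one waker
 * using the simple waitqueue API from include/linux/swait.h. The my_*
 * names are hypothetical.
 */
#include <linux/swait.h>
#include <linux/sched.h>
#include <linux/compiler.h>

static DECLARE_SWAIT_QUEUE_HEAD(my_wq);
static bool my_event_pending;

/* Waiter: sleep interruptibly until the condition becomes true. */
static int my_wait_for_event(void)
{
	return swait_event_interruptible(my_wq, READ_ONCE(my_event_pending));
}

/* Waker: set the condition, then wake a sleeper (always TASK_NORMAL). */
static void my_signal_event(void)
{
	WRITE_ONCE(my_event_pending, true);
	if (swait_active(&my_wq))
		swake_up(&my_wq);
}

As the header comment in swait.h notes, swait trades features for determinism: wakeups are always TASK_NORMAL, and there is no exclusive mode and no custom wake functions, which is what keeps IRQ and lock hold times bounded.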
-rw-r--r--  Documentation/kernel-parameters.txt |   5
-rw-r--r--  Documentation/sysctl/kernel.txt     |   8
-rw-r--r--  arch/arm/kvm/arm.c                  |   8
-rw-r--r--  arch/arm/kvm/psci.c                 |   4
-rw-r--r--  arch/mips/kvm/mips.c                |   8
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h |   4
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c        |  23
-rw-r--r--  arch/s390/include/asm/kvm_host.h    |   2
-rw-r--r--  arch/s390/kvm/interrupt.c           |   4
-rw-r--r--  arch/x86/kvm/lapic.c                |   6
-rw-r--r--  include/linux/ftrace.h              |  12
-rw-r--r--  include/linux/kvm_host.h            |   5
-rw-r--r--  include/linux/latencytop.h          |   3
-rw-r--r--  include/linux/sched.h               |  14
-rw-r--r--  include/linux/sched/sysctl.h        |   4
-rw-r--r--  include/linux/swait.h               | 172
-rw-r--r--  include/linux/wait.h                |   2
-rw-r--r--  kernel/latencytop.c                 |  14
-rw-r--r--  kernel/profile.c                    |   1
-rw-r--r--  kernel/rcu/tree.c                   |  24
-rw-r--r--  kernel/rcu/tree.h                   |  12
-rw-r--r--  kernel/rcu/tree_plugin.h            |  32
-rw-r--r--  kernel/sched/Makefile               |   2
-rw-r--r--  kernel/sched/core.c                 | 449
-rw-r--r--  kernel/sched/cputime.c              |  53
-rw-r--r--  kernel/sched/deadline.c             |  60
-rw-r--r--  kernel/sched/debug.c                | 415
-rw-r--r--  kernel/sched/fair.c                 | 297
-rw-r--r--  kernel/sched/rt.c                   |  96
-rw-r--r--  kernel/sched/sched.h                |  53
-rw-r--r--  kernel/sched/stats.h                |   8
-rw-r--r--  kernel/sched/swait.c                | 123
-rw-r--r--  kernel/softirq.c                    |   4
-rw-r--r--  kernel/sysctl.c                     |  13
-rw-r--r--  kernel/tsacct.c                     |  54
-rw-r--r--  virt/kvm/async_pf.c                 |   4
-rw-r--r--  virt/kvm/kvm_main.c                 |  17
37 files changed, 1300 insertions, 715 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 000336733a6a..8ae47a7b4923 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3532,6 +3532,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
3532 3532
3533 sched_debug [KNL] Enables verbose scheduler debug messages. 3533 sched_debug [KNL] Enables verbose scheduler debug messages.
3534 3534
3535 schedstats= [KNL,X86] Enable or disable scheduled statistics.
3536 Allowed values are enable and disable. This feature
3537 incurs a small amount of overhead in the scheduler
3538 but is useful for debugging and performance tuning.
3539
3535 skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate 3540 skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate
3536 xtime_lock contention on larger systems, and/or RCU lock 3541 xtime_lock contention on larger systems, and/or RCU lock
3537 contention on all systems with CONFIG_MAXSMP set. 3542 contention on all systems with CONFIG_MAXSMP set.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index f886fbb1ad05..f4444c94ff28 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -773,6 +773,14 @@ rtsig-nr shows the number of RT signals currently queued.
773 773
774============================================================== 774==============================================================
775 775
776sched_schedstats:
777
778Enables/disables scheduler statistics. Enabling this feature
779incurs a small amount of overhead in the scheduler but is
780useful for debugging and performance tuning.
781
782==============================================================
783
776sg-big-buff: 784sg-big-buff:
777 785
778This file shows the size of the generic SCSI (sg) buffer. 786This file shows the size of the generic SCSI (sg) buffer.
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index dda1959f0dde..08e49c423c24 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -506,18 +506,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm)
506 struct kvm_vcpu *vcpu; 506 struct kvm_vcpu *vcpu;
507 507
508 kvm_for_each_vcpu(i, vcpu, kvm) { 508 kvm_for_each_vcpu(i, vcpu, kvm) {
509 wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); 509 struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
510 510
511 vcpu->arch.pause = false; 511 vcpu->arch.pause = false;
512 wake_up_interruptible(wq); 512 swake_up(wq);
513 } 513 }
514} 514}
515 515
516static void vcpu_sleep(struct kvm_vcpu *vcpu) 516static void vcpu_sleep(struct kvm_vcpu *vcpu)
517{ 517{
518 wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); 518 struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
519 519
520 wait_event_interruptible(*wq, ((!vcpu->arch.power_off) && 520 swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
521 (!vcpu->arch.pause))); 521 (!vcpu->arch.pause)));
522} 522}
523 523
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index a9b3b905e661..c2b131527a64 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
70{ 70{
71 struct kvm *kvm = source_vcpu->kvm; 71 struct kvm *kvm = source_vcpu->kvm;
72 struct kvm_vcpu *vcpu = NULL; 72 struct kvm_vcpu *vcpu = NULL;
73 wait_queue_head_t *wq; 73 struct swait_queue_head *wq;
74 unsigned long cpu_id; 74 unsigned long cpu_id;
75 unsigned long context_id; 75 unsigned long context_id;
76 phys_addr_t target_pc; 76 phys_addr_t target_pc;
@@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
119 smp_mb(); /* Make sure the above is visible */ 119 smp_mb(); /* Make sure the above is visible */
120 120
121 wq = kvm_arch_vcpu_wq(vcpu); 121 wq = kvm_arch_vcpu_wq(vcpu);
122 wake_up_interruptible(wq); 122 swake_up(wq);
123 123
124 return PSCI_RET_SUCCESS; 124 return PSCI_RET_SUCCESS;
125} 125}
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 3110447ab1e9..70ef1a43c114 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -445,8 +445,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
445 445
446 dvcpu->arch.wait = 0; 446 dvcpu->arch.wait = 0;
447 447
448 if (waitqueue_active(&dvcpu->wq)) 448 if (swait_active(&dvcpu->wq))
449 wake_up_interruptible(&dvcpu->wq); 449 swake_up(&dvcpu->wq);
450 450
451 return 0; 451 return 0;
452} 452}
@@ -1174,8 +1174,8 @@ static void kvm_mips_comparecount_func(unsigned long data)
1174 kvm_mips_callbacks->queue_timer_int(vcpu); 1174 kvm_mips_callbacks->queue_timer_int(vcpu);
1175 1175
1176 vcpu->arch.wait = 0; 1176 vcpu->arch.wait = 0;
1177 if (waitqueue_active(&vcpu->wq)) 1177 if (swait_active(&vcpu->wq))
1178 wake_up_interruptible(&vcpu->wq); 1178 swake_up(&vcpu->wq);
1179} 1179}
1180 1180
1181/* low level hrtimer wake routine */ 1181/* low level hrtimer wake routine */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 9d08d8cbed1a..c98afa538b3a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -289,7 +289,7 @@ struct kvmppc_vcore {
289 struct list_head runnable_threads; 289 struct list_head runnable_threads;
290 struct list_head preempt_list; 290 struct list_head preempt_list;
291 spinlock_t lock; 291 spinlock_t lock;
292 wait_queue_head_t wq; 292 struct swait_queue_head wq;
293 spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ 293 spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
294 u64 stolen_tb; 294 u64 stolen_tb;
295 u64 preempt_tb; 295 u64 preempt_tb;
@@ -629,7 +629,7 @@ struct kvm_vcpu_arch {
629 u8 prodded; 629 u8 prodded;
630 u32 last_inst; 630 u32 last_inst;
631 631
632 wait_queue_head_t *wqp; 632 struct swait_queue_head *wqp;
633 struct kvmppc_vcore *vcore; 633 struct kvmppc_vcore *vcore;
634 int ret; 634 int ret;
635 int trap; 635 int trap;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index baeddb06811d..f1187bb6dd4d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu)
114static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) 114static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
115{ 115{
116 int cpu; 116 int cpu;
117 wait_queue_head_t *wqp; 117 struct swait_queue_head *wqp;
118 118
119 wqp = kvm_arch_vcpu_wq(vcpu); 119 wqp = kvm_arch_vcpu_wq(vcpu);
120 if (waitqueue_active(wqp)) { 120 if (swait_active(wqp)) {
121 wake_up_interruptible(wqp); 121 swake_up(wqp);
122 ++vcpu->stat.halt_wakeup; 122 ++vcpu->stat.halt_wakeup;
123 } 123 }
124 124
@@ -701,8 +701,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
701 tvcpu->arch.prodded = 1; 701 tvcpu->arch.prodded = 1;
702 smp_mb(); 702 smp_mb();
703 if (vcpu->arch.ceded) { 703 if (vcpu->arch.ceded) {
704 if (waitqueue_active(&vcpu->wq)) { 704 if (swait_active(&vcpu->wq)) {
705 wake_up_interruptible(&vcpu->wq); 705 swake_up(&vcpu->wq);
706 vcpu->stat.halt_wakeup++; 706 vcpu->stat.halt_wakeup++;
707 } 707 }
708 } 708 }
@@ -1459,7 +1459,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
1459 INIT_LIST_HEAD(&vcore->runnable_threads); 1459 INIT_LIST_HEAD(&vcore->runnable_threads);
1460 spin_lock_init(&vcore->lock); 1460 spin_lock_init(&vcore->lock);
1461 spin_lock_init(&vcore->stoltb_lock); 1461 spin_lock_init(&vcore->stoltb_lock);
1462 init_waitqueue_head(&vcore->wq); 1462 init_swait_queue_head(&vcore->wq);
1463 vcore->preempt_tb = TB_NIL; 1463 vcore->preempt_tb = TB_NIL;
1464 vcore->lpcr = kvm->arch.lpcr; 1464 vcore->lpcr = kvm->arch.lpcr;
1465 vcore->first_vcpuid = core * threads_per_subcore; 1465 vcore->first_vcpuid = core * threads_per_subcore;
@@ -2531,10 +2531,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2531{ 2531{
2532 struct kvm_vcpu *vcpu; 2532 struct kvm_vcpu *vcpu;
2533 int do_sleep = 1; 2533 int do_sleep = 1;
2534 DECLARE_SWAITQUEUE(wait);
2534 2535
2535 DEFINE_WAIT(wait); 2536 prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2536
2537 prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2538 2537
2539 /* 2538 /*
2540 * Check one last time for pending exceptions and ceded state after 2539 * Check one last time for pending exceptions and ceded state after
@@ -2548,7 +2547,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2548 } 2547 }
2549 2548
2550 if (!do_sleep) { 2549 if (!do_sleep) {
2551 finish_wait(&vc->wq, &wait); 2550 finish_swait(&vc->wq, &wait);
2552 return; 2551 return;
2553 } 2552 }
2554 2553
@@ -2556,7 +2555,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2556 trace_kvmppc_vcore_blocked(vc, 0); 2555 trace_kvmppc_vcore_blocked(vc, 0);
2557 spin_unlock(&vc->lock); 2556 spin_unlock(&vc->lock);
2558 schedule(); 2557 schedule();
2559 finish_wait(&vc->wq, &wait); 2558 finish_swait(&vc->wq, &wait);
2560 spin_lock(&vc->lock); 2559 spin_lock(&vc->lock);
2561 vc->vcore_state = VCORE_INACTIVE; 2560 vc->vcore_state = VCORE_INACTIVE;
2562 trace_kvmppc_vcore_blocked(vc, 1); 2561 trace_kvmppc_vcore_blocked(vc, 1);
@@ -2612,7 +2611,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2612 kvmppc_start_thread(vcpu, vc); 2611 kvmppc_start_thread(vcpu, vc);
2613 trace_kvm_guest_enter(vcpu); 2612 trace_kvm_guest_enter(vcpu);
2614 } else if (vc->vcore_state == VCORE_SLEEPING) { 2613 } else if (vc->vcore_state == VCORE_SLEEPING) {
2615 wake_up(&vc->wq); 2614 swake_up(&vc->wq);
2616 } 2615 }
2617 2616
2618 } 2617 }
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8959ebb6d2c9..b0c8ad0799c7 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -467,7 +467,7 @@ struct kvm_s390_irq_payload {
467struct kvm_s390_local_interrupt { 467struct kvm_s390_local_interrupt {
468 spinlock_t lock; 468 spinlock_t lock;
469 struct kvm_s390_float_interrupt *float_int; 469 struct kvm_s390_float_interrupt *float_int;
470 wait_queue_head_t *wq; 470 struct swait_queue_head *wq;
471 atomic_t *cpuflags; 471 atomic_t *cpuflags;
472 DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); 472 DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
473 struct kvm_s390_irq_payload irq; 473 struct kvm_s390_irq_payload irq;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f88ca72c3a77..9ffc73221792 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -966,13 +966,13 @@ no_timer:
966 966
967void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) 967void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
968{ 968{
969 if (waitqueue_active(&vcpu->wq)) { 969 if (swait_active(&vcpu->wq)) {
970 /* 970 /*
971 * The vcpu gave up the cpu voluntarily, mark it as a good 971 * The vcpu gave up the cpu voluntarily, mark it as a good
972 * yield-candidate. 972 * yield-candidate.
973 */ 973 */
974 vcpu->preempted = true; 974 vcpu->preempted = true;
975 wake_up_interruptible(&vcpu->wq); 975 swake_up(&vcpu->wq);
976 vcpu->stat.halt_wakeup++; 976 vcpu->stat.halt_wakeup++;
977 } 977 }
978} 978}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36591faed13b..3a045f39ed81 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
1195static void apic_timer_expired(struct kvm_lapic *apic) 1195static void apic_timer_expired(struct kvm_lapic *apic)
1196{ 1196{
1197 struct kvm_vcpu *vcpu = apic->vcpu; 1197 struct kvm_vcpu *vcpu = apic->vcpu;
1198 wait_queue_head_t *q = &vcpu->wq; 1198 struct swait_queue_head *q = &vcpu->wq;
1199 struct kvm_timer *ktimer = &apic->lapic_timer; 1199 struct kvm_timer *ktimer = &apic->lapic_timer;
1200 1200
1201 if (atomic_read(&apic->lapic_timer.pending)) 1201 if (atomic_read(&apic->lapic_timer.pending))
@@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic)
1204 atomic_inc(&apic->lapic_timer.pending); 1204 atomic_inc(&apic->lapic_timer.pending);
1205 kvm_set_pending_timer(vcpu); 1205 kvm_set_pending_timer(vcpu);
1206 1206
1207 if (waitqueue_active(q)) 1207 if (swait_active(q))
1208 wake_up_interruptible(q); 1208 swake_up(q);
1209 1209
1210 if (apic_lvtt_tscdeadline(apic)) 1210 if (apic_lvtt_tscdeadline(apic))
1211 ktimer->expired_tscdeadline = ktimer->tscdeadline; 1211 ktimer->expired_tscdeadline = ktimer->tscdeadline;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index c2b340e23f62..6d9df3f7e334 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -713,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled)
713#define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) 713#define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
714#define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) 714#define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
715 715
716static inline unsigned long get_lock_parent_ip(void)
717{
718 unsigned long addr = CALLER_ADDR0;
719
720 if (!in_lock_functions(addr))
721 return addr;
722 addr = CALLER_ADDR1;
723 if (!in_lock_functions(addr))
724 return addr;
725 return CALLER_ADDR2;
726}
727
716#ifdef CONFIG_IRQSOFF_TRACER 728#ifdef CONFIG_IRQSOFF_TRACER
717 extern void time_hardirqs_on(unsigned long a0, unsigned long a1); 729 extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
718 extern void time_hardirqs_off(unsigned long a0, unsigned long a1); 730 extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 861f690aa791..5276fe0916fc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -25,6 +25,7 @@
25#include <linux/irqflags.h> 25#include <linux/irqflags.h>
26#include <linux/context_tracking.h> 26#include <linux/context_tracking.h>
27#include <linux/irqbypass.h> 27#include <linux/irqbypass.h>
28#include <linux/swait.h>
28#include <asm/signal.h> 29#include <asm/signal.h>
29 30
30#include <linux/kvm.h> 31#include <linux/kvm.h>
@@ -218,7 +219,7 @@ struct kvm_vcpu {
218 int fpu_active; 219 int fpu_active;
219 int guest_fpu_loaded, guest_xcr0_loaded; 220 int guest_fpu_loaded, guest_xcr0_loaded;
220 unsigned char fpu_counter; 221 unsigned char fpu_counter;
221 wait_queue_head_t wq; 222 struct swait_queue_head wq;
222 struct pid *pid; 223 struct pid *pid;
223 int sigset_active; 224 int sigset_active;
224 sigset_t sigset; 225 sigset_t sigset;
@@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
782} 783}
783#endif 784#endif
784 785
785static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) 786static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
786{ 787{
787#ifdef __KVM_HAVE_ARCH_WQP 788#ifdef __KVM_HAVE_ARCH_WQP
788 return vcpu->arch.wqp; 789 return vcpu->arch.wqp;
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index e23121f9d82a..59ccab297ae0 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
37 37
38void clear_all_latency_tracing(struct task_struct *p); 38void clear_all_latency_tracing(struct task_struct *p);
39 39
40extern int sysctl_latencytop(struct ctl_table *table, int write,
41 void __user *buffer, size_t *lenp, loff_t *ppos);
42
40#else 43#else
41 44
42static inline void 45static inline void
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a94cc3..838a89a78332 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -182,8 +182,6 @@ extern void update_cpu_load_nohz(int active);
182static inline void update_cpu_load_nohz(int active) { } 182static inline void update_cpu_load_nohz(int active) { }
183#endif 183#endif
184 184
185extern unsigned long get_parent_ip(unsigned long addr);
186
187extern void dump_cpu_task(int cpu); 185extern void dump_cpu_task(int cpu);
188 186
189struct seq_file; 187struct seq_file;
@@ -920,6 +918,10 @@ static inline int sched_info_on(void)
920#endif 918#endif
921} 919}
922 920
921#ifdef CONFIG_SCHEDSTATS
922void force_schedstat_enabled(void);
923#endif
924
923enum cpu_idle_type { 925enum cpu_idle_type {
924 CPU_IDLE, 926 CPU_IDLE,
925 CPU_NOT_IDLE, 927 CPU_NOT_IDLE,
@@ -1289,6 +1291,8 @@ struct sched_rt_entity {
1289 unsigned long timeout; 1291 unsigned long timeout;
1290 unsigned long watchdog_stamp; 1292 unsigned long watchdog_stamp;
1291 unsigned int time_slice; 1293 unsigned int time_slice;
1294 unsigned short on_rq;
1295 unsigned short on_list;
1292 1296
1293 struct sched_rt_entity *back; 1297 struct sched_rt_entity *back;
1294#ifdef CONFIG_RT_GROUP_SCHED 1298#ifdef CONFIG_RT_GROUP_SCHED
@@ -1329,10 +1333,6 @@ struct sched_dl_entity {
1329 * task has to wait for a replenishment to be performed at the 1333 * task has to wait for a replenishment to be performed at the
1330 * next firing of dl_timer. 1334 * next firing of dl_timer.
1331 * 1335 *
1332 * @dl_new tells if a new instance arrived. If so we must
1333 * start executing it with full runtime and reset its absolute
1334 * deadline;
1335 *
1336 * @dl_boosted tells if we are boosted due to DI. If so we are 1336 * @dl_boosted tells if we are boosted due to DI. If so we are
1337 * outside bandwidth enforcement mechanism (but only until we 1337 * outside bandwidth enforcement mechanism (but only until we
1338 * exit the critical section); 1338 * exit the critical section);
@@ -1340,7 +1340,7 @@ struct sched_dl_entity {
1340 * @dl_yielded tells if task gave up the cpu before consuming 1340 * @dl_yielded tells if task gave up the cpu before consuming
1341 * all its available runtime during the last job. 1341 * all its available runtime during the last job.
1342 */ 1342 */
1343 int dl_throttled, dl_new, dl_boosted, dl_yielded; 1343 int dl_throttled, dl_boosted, dl_yielded;
1344 1344
1345 /* 1345 /*
1346 * Bandwidth enforcement timer. Each -deadline task has its 1346 * Bandwidth enforcement timer. Each -deadline task has its
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731cf10b..4f080ab4f2cd 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write,
95 void __user *buffer, size_t *lenp, 95 void __user *buffer, size_t *lenp,
96 loff_t *ppos); 96 loff_t *ppos);
97 97
98extern int sysctl_schedstats(struct ctl_table *table, int write,
99 void __user *buffer, size_t *lenp,
100 loff_t *ppos);
101
98#endif /* _SCHED_SYSCTL_H */ 102#endif /* _SCHED_SYSCTL_H */
diff --git a/include/linux/swait.h b/include/linux/swait.h
new file mode 100644
index 000000000000..c1f9c62a8a50
--- /dev/null
+++ b/include/linux/swait.h
@@ -0,0 +1,172 @@
1#ifndef _LINUX_SWAIT_H
2#define _LINUX_SWAIT_H
3
4#include <linux/list.h>
5#include <linux/stddef.h>
6#include <linux/spinlock.h>
7#include <asm/current.h>
8
9/*
10 * Simple wait queues
11 *
12 * While these are very similar to the other/complex wait queues (wait.h) the
13 * most important difference is that the simple waitqueue allows for
14 * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold
15 * times.
16 *
17 * In order to make this so, we had to drop a fair number of features of the
18 * other waitqueue code; notably:
19 *
20 * - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue;
21 * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
22 * sleeper state.
23 *
24 * - the exclusive mode; because this requires preserving the list order
25 * and this is hard.
26 *
27 * - custom wake functions; because you cannot give any guarantees about
28 * random code.
29 *
30 * As a side effect of this; the data structures are slimmer.
31 *
32 * One would recommend using this wait queue where possible.
33 */
34
35struct task_struct;
36
37struct swait_queue_head {
38 raw_spinlock_t lock;
39 struct list_head task_list;
40};
41
42struct swait_queue {
43 struct task_struct *task;
44 struct list_head task_list;
45};
46
47#define __SWAITQUEUE_INITIALIZER(name) { \
48 .task = current, \
49 .task_list = LIST_HEAD_INIT((name).task_list), \
50}
51
52#define DECLARE_SWAITQUEUE(name) \
53 struct swait_queue name = __SWAITQUEUE_INITIALIZER(name)
54
55#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) { \
56 .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
57 .task_list = LIST_HEAD_INIT((name).task_list), \
58}
59
60#define DECLARE_SWAIT_QUEUE_HEAD(name) \
61 struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name)
62
63extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
64 struct lock_class_key *key);
65
66#define init_swait_queue_head(q) \
67 do { \
68 static struct lock_class_key __key; \
69 __init_swait_queue_head((q), #q, &__key); \
70 } while (0)
71
72#ifdef CONFIG_LOCKDEP
73# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
74 ({ init_swait_queue_head(&name); name; })
75# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \
76 struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name)
77#else
78# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \
79 DECLARE_SWAIT_QUEUE_HEAD(name)
80#endif
81
82static inline int swait_active(struct swait_queue_head *q)
83{
84 return !list_empty(&q->task_list);
85}
86
87extern void swake_up(struct swait_queue_head *q);
88extern void swake_up_all(struct swait_queue_head *q);
89extern void swake_up_locked(struct swait_queue_head *q);
90
91extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
92extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
93extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
94
95extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
96extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
97
98/* as per ___wait_event() but for swait, therefore "exclusive == 0" */
99#define ___swait_event(wq, condition, state, ret, cmd) \
100({ \
101 struct swait_queue __wait; \
102 long __ret = ret; \
103 \
104 INIT_LIST_HEAD(&__wait.task_list); \
105 for (;;) { \
106 long __int = prepare_to_swait_event(&wq, &__wait, state);\
107 \
108 if (condition) \
109 break; \
110 \
111 if (___wait_is_interruptible(state) && __int) { \
112 __ret = __int; \
113 break; \
114 } \
115 \
116 cmd; \
117 } \
118 finish_swait(&wq, &__wait); \
119 __ret; \
120})
121
122#define __swait_event(wq, condition) \
123 (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \
124 schedule())
125
126#define swait_event(wq, condition) \
127do { \
128 if (condition) \
129 break; \
130 __swait_event(wq, condition); \
131} while (0)
132
133#define __swait_event_timeout(wq, condition, timeout) \
134 ___swait_event(wq, ___wait_cond_timeout(condition), \
135 TASK_UNINTERRUPTIBLE, timeout, \
136 __ret = schedule_timeout(__ret))
137
138#define swait_event_timeout(wq, condition, timeout) \
139({ \
140 long __ret = timeout; \
141 if (!___wait_cond_timeout(condition)) \
142 __ret = __swait_event_timeout(wq, condition, timeout); \
143 __ret; \
144})
145
146#define __swait_event_interruptible(wq, condition) \
147 ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \
148 schedule())
149
150#define swait_event_interruptible(wq, condition) \
151({ \
152 int __ret = 0; \
153 if (!(condition)) \
154 __ret = __swait_event_interruptible(wq, condition); \
155 __ret; \
156})
157
158#define __swait_event_interruptible_timeout(wq, condition, timeout) \
159 ___swait_event(wq, ___wait_cond_timeout(condition), \
160 TASK_INTERRUPTIBLE, timeout, \
161 __ret = schedule_timeout(__ret))
162
163#define swait_event_interruptible_timeout(wq, condition, timeout) \
164({ \
165 long __ret = timeout; \
166 if (!___wait_cond_timeout(condition)) \
167 __ret = __swait_event_interruptible_timeout(wq, \
168 condition, timeout); \
169 __ret; \
170})
171
172#endif /* _LINUX_SWAIT_H */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index ae71a769b89e..27d7a0ab5da3 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -338,7 +338,7 @@ do { \
338 schedule(); try_to_freeze()) 338 schedule(); try_to_freeze())
339 339
340/** 340/**
341 * wait_event - sleep (or freeze) until a condition gets true 341 * wait_event_freezable - sleep (or freeze) until a condition gets true
342 * @wq: the waitqueue to wait on 342 * @wq: the waitqueue to wait on
343 * @condition: a C expression for the event to wait for 343 * @condition: a C expression for the event to wait for
344 * 344 *
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a02812743a7e..b5c30d9f46c5 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -47,12 +47,12 @@
47 * of times) 47 * of times)
48 */ 48 */
49 49
50#include <linux/latencytop.h>
51#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
52#include <linux/seq_file.h> 51#include <linux/seq_file.h>
53#include <linux/notifier.h> 52#include <linux/notifier.h>
54#include <linux/spinlock.h> 53#include <linux/spinlock.h>
55#include <linux/proc_fs.h> 54#include <linux/proc_fs.h>
55#include <linux/latencytop.h>
56#include <linux/export.h> 56#include <linux/export.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
@@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void)
289 proc_create("latency_stats", 0644, NULL, &lstats_fops); 289 proc_create("latency_stats", 0644, NULL, &lstats_fops);
290 return 0; 290 return 0;
291} 291}
292
293int sysctl_latencytop(struct ctl_table *table, int write,
294 void __user *buffer, size_t *lenp, loff_t *ppos)
295{
296 int err;
297
298 err = proc_dointvec(table, write, buffer, lenp, ppos);
299 if (latencytop_enabled)
300 force_schedstat_enabled();
301
302 return err;
303}
292device_initcall(init_lstats_procfs); 304device_initcall(init_lstats_procfs);
diff --git a/kernel/profile.c b/kernel/profile.c
index 99513e1160e5..51369697466e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -59,6 +59,7 @@ int profile_setup(char *str)
59 59
60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 60 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
61#ifdef CONFIG_SCHEDSTATS 61#ifdef CONFIG_SCHEDSTATS
62 force_schedstat_enabled();
62 prof_on = SLEEP_PROFILING; 63 prof_on = SLEEP_PROFILING;
63 if (str[strlen(sleepstr)] == ',') 64 if (str[strlen(sleepstr)] == ',')
64 str += strlen(sleepstr) + 1; 65 str += strlen(sleepstr) + 1;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e41dd4131f7a..9fd5b628a88d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1614,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1614 int needmore; 1614 int needmore;
1615 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1615 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1616 1616
1617 rcu_nocb_gp_cleanup(rsp, rnp);
1618 rnp->need_future_gp[c & 0x1] = 0; 1617 rnp->need_future_gp[c & 0x1] = 0;
1619 needmore = rnp->need_future_gp[(c + 1) & 0x1]; 1618 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1620 trace_rcu_future_gp(rnp, rdp, c, 1619 trace_rcu_future_gp(rnp, rdp, c,
@@ -1635,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1635 !READ_ONCE(rsp->gp_flags) || 1634 !READ_ONCE(rsp->gp_flags) ||
1636 !rsp->gp_kthread) 1635 !rsp->gp_kthread)
1637 return; 1636 return;
1638 wake_up(&rsp->gp_wq); 1637 swake_up(&rsp->gp_wq);
1639} 1638}
1640 1639
1641/* 1640/*
@@ -2010,6 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2010 int nocb = 0; 2009 int nocb = 0;
2011 struct rcu_data *rdp; 2010 struct rcu_data *rdp;
2012 struct rcu_node *rnp = rcu_get_root(rsp); 2011 struct rcu_node *rnp = rcu_get_root(rsp);
2012 struct swait_queue_head *sq;
2013 2013
2014 WRITE_ONCE(rsp->gp_activity, jiffies); 2014 WRITE_ONCE(rsp->gp_activity, jiffies);
2015 raw_spin_lock_irq_rcu_node(rnp); 2015 raw_spin_lock_irq_rcu_node(rnp);
@@ -2046,7 +2046,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2046 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; 2046 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
2047 /* smp_mb() provided by prior unlock-lock pair. */ 2047 /* smp_mb() provided by prior unlock-lock pair. */
2048 nocb += rcu_future_gp_cleanup(rsp, rnp); 2048 nocb += rcu_future_gp_cleanup(rsp, rnp);
2049 sq = rcu_nocb_gp_get(rnp);
2049 raw_spin_unlock_irq(&rnp->lock); 2050 raw_spin_unlock_irq(&rnp->lock);
2051 rcu_nocb_gp_cleanup(sq);
2050 cond_resched_rcu_qs(); 2052 cond_resched_rcu_qs();
2051 WRITE_ONCE(rsp->gp_activity, jiffies); 2053 WRITE_ONCE(rsp->gp_activity, jiffies);
2052 rcu_gp_slow(rsp, gp_cleanup_delay); 2054 rcu_gp_slow(rsp, gp_cleanup_delay);
@@ -2092,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2092 READ_ONCE(rsp->gpnum), 2094 READ_ONCE(rsp->gpnum),
2093 TPS("reqwait")); 2095 TPS("reqwait"));
2094 rsp->gp_state = RCU_GP_WAIT_GPS; 2096 rsp->gp_state = RCU_GP_WAIT_GPS;
2095 wait_event_interruptible(rsp->gp_wq, 2097 swait_event_interruptible(rsp->gp_wq,
2096 READ_ONCE(rsp->gp_flags) & 2098 READ_ONCE(rsp->gp_flags) &
2097 RCU_GP_FLAG_INIT); 2099 RCU_GP_FLAG_INIT);
2098 rsp->gp_state = RCU_GP_DONE_GPS; 2100 rsp->gp_state = RCU_GP_DONE_GPS;
@@ -2122,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2122 READ_ONCE(rsp->gpnum), 2124 READ_ONCE(rsp->gpnum),
2123 TPS("fqswait")); 2125 TPS("fqswait"));
2124 rsp->gp_state = RCU_GP_WAIT_FQS; 2126 rsp->gp_state = RCU_GP_WAIT_FQS;
2125 ret = wait_event_interruptible_timeout(rsp->gp_wq, 2127 ret = swait_event_interruptible_timeout(rsp->gp_wq,
2126 rcu_gp_fqs_check_wake(rsp, &gf), j); 2128 rcu_gp_fqs_check_wake(rsp, &gf), j);
2127 rsp->gp_state = RCU_GP_DOING_FQS; 2129 rsp->gp_state = RCU_GP_DOING_FQS;
2128 /* Locking provides needed memory barriers. */ 2130 /* Locking provides needed memory barriers. */
@@ -2246,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
2246 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 2248 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
2247 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2249 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
2248 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2250 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2249 rcu_gp_kthread_wake(rsp); 2251 swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
2250} 2252}
2251 2253
2252/* 2254/*
@@ -2900,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2900 } 2902 }
2901 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2903 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
2902 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2904 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2903 rcu_gp_kthread_wake(rsp); 2905 swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
2904} 2906}
2905 2907
2906/* 2908/*
@@ -3529,7 +3531,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
3529 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3531 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3530 if (wake) { 3532 if (wake) {
3531 smp_mb(); /* EGP done before wake_up(). */ 3533 smp_mb(); /* EGP done before wake_up(). */
3532 wake_up(&rsp->expedited_wq); 3534 swake_up(&rsp->expedited_wq);
3533 } 3535 }
3534 break; 3536 break;
3535 } 3537 }
@@ -3780,7 +3782,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3780 jiffies_start = jiffies; 3782 jiffies_start = jiffies;
3781 3783
3782 for (;;) { 3784 for (;;) {
3783 ret = wait_event_interruptible_timeout( 3785 ret = swait_event_timeout(
3784 rsp->expedited_wq, 3786 rsp->expedited_wq,
3785 sync_rcu_preempt_exp_done(rnp_root), 3787 sync_rcu_preempt_exp_done(rnp_root),
3786 jiffies_stall); 3788 jiffies_stall);
@@ -3788,7 +3790,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3788 return; 3790 return;
3789 if (ret < 0) { 3791 if (ret < 0) {
3790 /* Hit a signal, disable CPU stall warnings. */ 3792 /* Hit a signal, disable CPU stall warnings. */
3791 wait_event(rsp->expedited_wq, 3793 swait_event(rsp->expedited_wq,
3792 sync_rcu_preempt_exp_done(rnp_root)); 3794 sync_rcu_preempt_exp_done(rnp_root));
3793 return; 3795 return;
3794 } 3796 }
@@ -4482,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
4482 } 4484 }
4483 } 4485 }
4484 4486
4485 init_waitqueue_head(&rsp->gp_wq); 4487 init_swait_queue_head(&rsp->gp_wq);
4486 init_waitqueue_head(&rsp->expedited_wq); 4488 init_swait_queue_head(&rsp->expedited_wq);
4487 rnp = rsp->level[rcu_num_lvls - 1]; 4489 rnp = rsp->level[rcu_num_lvls - 1];
4488 for_each_possible_cpu(i) { 4490 for_each_possible_cpu(i) {
4489 while (i > rnp->grphi) 4491 while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 83360b4f4352..bbd235d0e71f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
27#include <linux/threads.h> 27#include <linux/threads.h>
28#include <linux/cpumask.h> 28#include <linux/cpumask.h>
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30#include <linux/swait.h>
30#include <linux/stop_machine.h> 31#include <linux/stop_machine.h>
31 32
32/* 33/*
@@ -243,7 +244,7 @@ struct rcu_node {
243 /* Refused to boost: not sure why, though. */ 244 /* Refused to boost: not sure why, though. */
244 /* This can happen due to race conditions. */ 245 /* This can happen due to race conditions. */
245#ifdef CONFIG_RCU_NOCB_CPU 246#ifdef CONFIG_RCU_NOCB_CPU
246 wait_queue_head_t nocb_gp_wq[2]; 247 struct swait_queue_head nocb_gp_wq[2];
247 /* Place for rcu_nocb_kthread() to wait GP. */ 248 /* Place for rcu_nocb_kthread() to wait GP. */
248#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 249#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
249 int need_future_gp[2]; 250 int need_future_gp[2];
@@ -399,7 +400,7 @@ struct rcu_data {
399 atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ 400 atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
400 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ 401 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
401 struct rcu_head **nocb_follower_tail; 402 struct rcu_head **nocb_follower_tail;
402 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 403 struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
403 struct task_struct *nocb_kthread; 404 struct task_struct *nocb_kthread;
404 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 405 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
405 406
@@ -478,7 +479,7 @@ struct rcu_state {
478 unsigned long gpnum; /* Current gp number. */ 479 unsigned long gpnum; /* Current gp number. */
479 unsigned long completed; /* # of last completed gp. */ 480 unsigned long completed; /* # of last completed gp. */
480 struct task_struct *gp_kthread; /* Task for grace periods. */ 481 struct task_struct *gp_kthread; /* Task for grace periods. */
481 wait_queue_head_t gp_wq; /* Where GP task waits. */ 482 struct swait_queue_head gp_wq; /* Where GP task waits. */
482 short gp_flags; /* Commands for GP task. */ 483 short gp_flags; /* Commands for GP task. */
483 short gp_state; /* GP kthread sleep state. */ 484 short gp_state; /* GP kthread sleep state. */
484 485
@@ -506,7 +507,7 @@ struct rcu_state {
506 unsigned long expedited_sequence; /* Take a ticket. */ 507 unsigned long expedited_sequence; /* Take a ticket. */
507 atomic_long_t expedited_normal; /* # fallbacks to normal. */ 508 atomic_long_t expedited_normal; /* # fallbacks to normal. */
508 atomic_t expedited_need_qs; /* # CPUs left to check in. */ 509 atomic_t expedited_need_qs; /* # CPUs left to check in. */
509 wait_queue_head_t expedited_wq; /* Wait for check-ins. */ 510 struct swait_queue_head expedited_wq; /* Wait for check-ins. */
510 int ncpus_snap; /* # CPUs seen last time. */ 511 int ncpus_snap; /* # CPUs seen last time. */
511 512
512 unsigned long jiffies_force_qs; /* Time at which to invoke */ 513 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -621,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
621static void increment_cpu_stall_ticks(void); 622static void increment_cpu_stall_ticks(void);
622static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); 623static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
623static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 624static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
624static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 625static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
626static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
625static void rcu_init_one_nocb(struct rcu_node *rnp); 627static void rcu_init_one_nocb(struct rcu_node *rnp);
626static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 628static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
627 bool lazy, unsigned long flags); 629 bool lazy, unsigned long flags);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 9467a8b7e756..080bd202d360 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
1811 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 1811 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
1812 * grace period. 1812 * grace period.
1813 */ 1813 */
1814static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 1814static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
1815{ 1815{
1816 wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); 1816 swake_up_all(sq);
1817} 1817}
1818 1818
1819/* 1819/*
@@ -1829,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
1829 rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; 1829 rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
1830} 1830}
1831 1831
1832static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
1833{
1834 return &rnp->nocb_gp_wq[rnp->completed & 0x1];
1835}
1836
1832static void rcu_init_one_nocb(struct rcu_node *rnp) 1837static void rcu_init_one_nocb(struct rcu_node *rnp)
1833{ 1838{
1834 init_waitqueue_head(&rnp->nocb_gp_wq[0]); 1839 init_swait_queue_head(&rnp->nocb_gp_wq[0]);
1835 init_waitqueue_head(&rnp->nocb_gp_wq[1]); 1840 init_swait_queue_head(&rnp->nocb_gp_wq[1]);
1836} 1841}
1837 1842
1838#ifndef CONFIG_RCU_NOCB_CPU_ALL 1843#ifndef CONFIG_RCU_NOCB_CPU_ALL
@@ -1857,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
1857 if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { 1862 if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
1858 /* Prior smp_mb__after_atomic() orders against prior enqueue. */ 1863 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
1859 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); 1864 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
1860 wake_up(&rdp_leader->nocb_wq); 1865 swake_up(&rdp_leader->nocb_wq);
1861 } 1866 }
1862} 1867}
1863 1868
@@ -2069,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2069 */ 2074 */
2070 trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); 2075 trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
2071 for (;;) { 2076 for (;;) {
2072 wait_event_interruptible( 2077 swait_event_interruptible(
2073 rnp->nocb_gp_wq[c & 0x1], 2078 rnp->nocb_gp_wq[c & 0x1],
2074 (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); 2079 (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
2075 if (likely(d)) 2080 if (likely(d))
@@ -2097,7 +2102,7 @@ wait_again:
2097 /* Wait for callbacks to appear. */ 2102 /* Wait for callbacks to appear. */
2098 if (!rcu_nocb_poll) { 2103 if (!rcu_nocb_poll) {
2099 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); 2104 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
2100 wait_event_interruptible(my_rdp->nocb_wq, 2105 swait_event_interruptible(my_rdp->nocb_wq,
2101 !READ_ONCE(my_rdp->nocb_leader_sleep)); 2106 !READ_ONCE(my_rdp->nocb_leader_sleep));
2102 /* Memory barrier handled by smp_mb() calls below and repoll. */ 2107 /* Memory barrier handled by smp_mb() calls below and repoll. */
2103 } else if (firsttime) { 2108 } else if (firsttime) {
@@ -2172,7 +2177,7 @@ wait_again:
2172 * List was empty, wake up the follower. 2177 * List was empty, wake up the follower.
2173 * Memory barriers supplied by atomic_long_add(). 2178 * Memory barriers supplied by atomic_long_add().
2174 */ 2179 */
2175 wake_up(&rdp->nocb_wq); 2180 swake_up(&rdp->nocb_wq);
2176 } 2181 }
2177 } 2182 }
2178 2183
@@ -2193,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
2193 if (!rcu_nocb_poll) { 2198 if (!rcu_nocb_poll) {
2194 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2199 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2195 "FollowerSleep"); 2200 "FollowerSleep");
2196 wait_event_interruptible(rdp->nocb_wq, 2201 swait_event_interruptible(rdp->nocb_wq,
2197 READ_ONCE(rdp->nocb_follower_head)); 2202 READ_ONCE(rdp->nocb_follower_head));
2198 } else if (firsttime) { 2203 } else if (firsttime) {
2199 /* Don't drown trace log with "Poll"! */ 2204 /* Don't drown trace log with "Poll"! */
@@ -2352,7 +2357,7 @@ void __init rcu_init_nohz(void)
2352static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2357static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2353{ 2358{
2354 rdp->nocb_tail = &rdp->nocb_head; 2359 rdp->nocb_tail = &rdp->nocb_head;
2355 init_waitqueue_head(&rdp->nocb_wq); 2360 init_swait_queue_head(&rdp->nocb_wq);
2356 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2361 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2357} 2362}
2358 2363
@@ -2502,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2502 return false; 2507 return false;
2503} 2508}
2504 2509
2505static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2510static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
2506{ 2511{
2507} 2512}
2508 2513
@@ -2510,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2510{ 2515{
2511} 2516}
2512 2517
2518static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
2519{
2520 return NULL;
2521}
2522
2513static void rcu_init_one_nocb(struct rcu_node *rnp) 2523static void rcu_init_one_nocb(struct rcu_node *rnp)
2514{ 2524{
2515} 2525}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 67687973ce80..7d4cba227cbd 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
13 13
14obj-y += core.o loadavg.o clock.o cputime.o 14obj-y += core.o loadavg.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16obj-y += wait.o completion.o idle.o 16obj-y += wait.o swait.o completion.o idle.o
17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
19obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41f6b2215aa8..05114b15b6d1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -67,12 +67,10 @@
67#include <linux/pagemap.h> 67#include <linux/pagemap.h>
68#include <linux/hrtimer.h> 68#include <linux/hrtimer.h>
69#include <linux/tick.h> 69#include <linux/tick.h>
70#include <linux/debugfs.h>
71#include <linux/ctype.h> 70#include <linux/ctype.h>
72#include <linux/ftrace.h> 71#include <linux/ftrace.h>
73#include <linux/slab.h> 72#include <linux/slab.h>
74#include <linux/init_task.h> 73#include <linux/init_task.h>
75#include <linux/binfmts.h>
76#include <linux/context_tracking.h> 74#include <linux/context_tracking.h>
77#include <linux/compiler.h> 75#include <linux/compiler.h>
78 76
@@ -125,138 +123,6 @@ const_debug unsigned int sysctl_sched_features =
125 123
126#undef SCHED_FEAT 124#undef SCHED_FEAT
127 125
128#ifdef CONFIG_SCHED_DEBUG
129#define SCHED_FEAT(name, enabled) \
130 #name ,
131
132static const char * const sched_feat_names[] = {
133#include "features.h"
134};
135
136#undef SCHED_FEAT
137
138static int sched_feat_show(struct seq_file *m, void *v)
139{
140 int i;
141
142 for (i = 0; i < __SCHED_FEAT_NR; i++) {
143 if (!(sysctl_sched_features & (1UL << i)))
144 seq_puts(m, "NO_");
145 seq_printf(m, "%s ", sched_feat_names[i]);
146 }
147 seq_puts(m, "\n");
148
149 return 0;
150}
151
152#ifdef HAVE_JUMP_LABEL
153
154#define jump_label_key__true STATIC_KEY_INIT_TRUE
155#define jump_label_key__false STATIC_KEY_INIT_FALSE
156
157#define SCHED_FEAT(name, enabled) \
158 jump_label_key__##enabled ,
159
160struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
161#include "features.h"
162};
163
164#undef SCHED_FEAT
165
166static void sched_feat_disable(int i)
167{
168 static_key_disable(&sched_feat_keys[i]);
169}
170
171static void sched_feat_enable(int i)
172{
173 static_key_enable(&sched_feat_keys[i]);
174}
175#else
176static void sched_feat_disable(int i) { };
177static void sched_feat_enable(int i) { };
178#endif /* HAVE_JUMP_LABEL */
179
180static int sched_feat_set(char *cmp)
181{
182 int i;
183 int neg = 0;
184
185 if (strncmp(cmp, "NO_", 3) == 0) {
186 neg = 1;
187 cmp += 3;
188 }
189
190 for (i = 0; i < __SCHED_FEAT_NR; i++) {
191 if (strcmp(cmp, sched_feat_names[i]) == 0) {
192 if (neg) {
193 sysctl_sched_features &= ~(1UL << i);
194 sched_feat_disable(i);
195 } else {
196 sysctl_sched_features |= (1UL << i);
197 sched_feat_enable(i);
198 }
199 break;
200 }
201 }
202
203 return i;
204}
205
206static ssize_t
207sched_feat_write(struct file *filp, const char __user *ubuf,
208 size_t cnt, loff_t *ppos)
209{
210 char buf[64];
211 char *cmp;
212 int i;
213 struct inode *inode;
214
215 if (cnt > 63)
216 cnt = 63;
217
218 if (copy_from_user(&buf, ubuf, cnt))
219 return -EFAULT;
220
221 buf[cnt] = 0;
222 cmp = strstrip(buf);
223
224 /* Ensure the static_key remains in a consistent state */
225 inode = file_inode(filp);
226 inode_lock(inode);
227 i = sched_feat_set(cmp);
228 inode_unlock(inode);
229 if (i == __SCHED_FEAT_NR)
230 return -EINVAL;
231
232 *ppos += cnt;
233
234 return cnt;
235}
236
237static int sched_feat_open(struct inode *inode, struct file *filp)
238{
239 return single_open(filp, sched_feat_show, NULL);
240}
241
242static const struct file_operations sched_feat_fops = {
243 .open = sched_feat_open,
244 .write = sched_feat_write,
245 .read = seq_read,
246 .llseek = seq_lseek,
247 .release = single_release,
248};
249
250static __init int sched_init_debug(void)
251{
252 debugfs_create_file("sched_features", 0644, NULL, NULL,
253 &sched_feat_fops);
254
255 return 0;
256}
257late_initcall(sched_init_debug);
258#endif /* CONFIG_SCHED_DEBUG */
259
260/* 126/*
261 * Number of tasks to iterate in a single balance run. 127 * Number of tasks to iterate in a single balance run.
262 * Limited because this is done with IRQs disabled. 128 * Limited because this is done with IRQs disabled.
@@ -2094,7 +1960,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2094 1960
2095 ttwu_queue(p, cpu); 1961 ttwu_queue(p, cpu);
2096stat: 1962stat:
2097 ttwu_stat(p, cpu, wake_flags); 1963 if (schedstat_enabled())
1964 ttwu_stat(p, cpu, wake_flags);
2098out: 1965out:
2099 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1966 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2100 1967
@@ -2142,7 +2009,8 @@ static void try_to_wake_up_local(struct task_struct *p)
2142 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2009 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2143 2010
2144 ttwu_do_wakeup(rq, p, 0); 2011 ttwu_do_wakeup(rq, p, 0);
2145 ttwu_stat(p, smp_processor_id(), 0); 2012 if (schedstat_enabled())
2013 ttwu_stat(p, smp_processor_id(), 0);
2146out: 2014out:
2147 raw_spin_unlock(&p->pi_lock); 2015 raw_spin_unlock(&p->pi_lock);
2148} 2016}
@@ -2184,7 +2052,6 @@ void __dl_clear_params(struct task_struct *p)
2184 dl_se->dl_bw = 0; 2052 dl_se->dl_bw = 0;
2185 2053
2186 dl_se->dl_throttled = 0; 2054 dl_se->dl_throttled = 0;
2187 dl_se->dl_new = 1;
2188 dl_se->dl_yielded = 0; 2055 dl_se->dl_yielded = 0;
2189} 2056}
2190 2057
@@ -2211,6 +2078,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2211#endif 2078#endif
2212 2079
2213#ifdef CONFIG_SCHEDSTATS 2080#ifdef CONFIG_SCHEDSTATS
2081 /* Even if schedstat is disabled, there should not be garbage */
2214 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2082 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2215#endif 2083#endif
2216 2084
@@ -2219,6 +2087,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2219 __dl_clear_params(p); 2087 __dl_clear_params(p);
2220 2088
2221 INIT_LIST_HEAD(&p->rt.run_list); 2089 INIT_LIST_HEAD(&p->rt.run_list);
2090 p->rt.timeout = 0;
2091 p->rt.time_slice = sched_rr_timeslice;
2092 p->rt.on_rq = 0;
2093 p->rt.on_list = 0;
2222 2094
2223#ifdef CONFIG_PREEMPT_NOTIFIERS 2095#ifdef CONFIG_PREEMPT_NOTIFIERS
2224 INIT_HLIST_HEAD(&p->preempt_notifiers); 2096 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2282,6 +2154,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
2282#endif 2154#endif
2283#endif 2155#endif
2284 2156
2157DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2158
2159#ifdef CONFIG_SCHEDSTATS
2160static void set_schedstats(bool enabled)
2161{
2162 if (enabled)
2163 static_branch_enable(&sched_schedstats);
2164 else
2165 static_branch_disable(&sched_schedstats);
2166}
2167
2168void force_schedstat_enabled(void)
2169{
2170 if (!schedstat_enabled()) {
2171 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
2172 static_branch_enable(&sched_schedstats);
2173 }
2174}
2175
2176static int __init setup_schedstats(char *str)
2177{
2178 int ret = 0;
2179 if (!str)
2180 goto out;
2181
2182 if (!strcmp(str, "enable")) {
2183 set_schedstats(true);
2184 ret = 1;
2185 } else if (!strcmp(str, "disable")) {
2186 set_schedstats(false);
2187 ret = 1;
2188 }
2189out:
2190 if (!ret)
2191 pr_warn("Unable to parse schedstats=\n");
2192
2193 return ret;
2194}
2195__setup("schedstats=", setup_schedstats);
2196
2197#ifdef CONFIG_PROC_SYSCTL
2198int sysctl_schedstats(struct ctl_table *table, int write,
2199 void __user *buffer, size_t *lenp, loff_t *ppos)
2200{
2201 struct ctl_table t;
2202 int err;
2203 int state = static_branch_likely(&sched_schedstats);
2204
2205 if (write && !capable(CAP_SYS_ADMIN))
2206 return -EPERM;
2207
2208 t = *table;
2209 t.data = &state;
2210 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2211 if (err < 0)
2212 return err;
2213 if (write)
2214 set_schedstats(state);
2215 return err;
2216}
2217#endif
2218#endif
2219
2285/* 2220/*
2286 * fork()/clone()-time setup: 2221 * fork()/clone()-time setup:
2287 */ 2222 */
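The hunk above makes schedstats a runtime tunable gated by a static key, so accounting calls such as ttwu_stat() cost almost nothing while the feature is disabled. Below is a minimal, self-contained sketch of that gating pattern; it is not this merge's code, the my_* names are hypothetical, and the real schedstat_enabled() helper used above is provided elsewhere in this series.

/*
 * Illustrative sketch (not this merge's code): the static-branch pattern
 * used above for the schedstats runtime tunable. The my_* names are
 * hypothetical.
 */
#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(my_stats_key);

static inline bool my_stats_enabled(void)
{
	/* Patched jump label: effectively free while the key is disabled. */
	return static_branch_unlikely(&my_stats_key);
}

static void my_stats_set(bool enabled)
{
	if (enabled)
		static_branch_enable(&my_stats_key);
	else
		static_branch_disable(&my_stats_key);
}

static void my_account_wakeup(void)
{
	if (my_stats_enabled()) {
		/* ... update per-task statistics, as ttwu_stat() does above ... */
	}
}

The sysctl handler above flips the same kind of key via set_schedstats(), which is why the overhead exists only while kernel.sched_schedstats is enabled.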
@@ -3011,16 +2946,6 @@ u64 scheduler_tick_max_deferment(void)
3011} 2946}
3012#endif 2947#endif
3013 2948
3014notrace unsigned long get_parent_ip(unsigned long addr)
3015{
3016 if (in_lock_functions(addr)) {
3017 addr = CALLER_ADDR2;
3018 if (in_lock_functions(addr))
3019 addr = CALLER_ADDR3;
3020 }
3021 return addr;
3022}
3023
3024#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2949#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3025 defined(CONFIG_PREEMPT_TRACER)) 2950 defined(CONFIG_PREEMPT_TRACER))
3026 2951
@@ -3042,7 +2967,7 @@ void preempt_count_add(int val)
3042 PREEMPT_MASK - 10); 2967 PREEMPT_MASK - 10);
3043#endif 2968#endif
3044 if (preempt_count() == val) { 2969 if (preempt_count() == val) {
3045 unsigned long ip = get_parent_ip(CALLER_ADDR1); 2970 unsigned long ip = get_lock_parent_ip();
3046#ifdef CONFIG_DEBUG_PREEMPT 2971#ifdef CONFIG_DEBUG_PREEMPT
3047 current->preempt_disable_ip = ip; 2972 current->preempt_disable_ip = ip;
3048#endif 2973#endif
@@ -3069,7 +2994,7 @@ void preempt_count_sub(int val)
3069#endif 2994#endif
3070 2995
3071 if (preempt_count() == val) 2996 if (preempt_count() == val)
3072 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2997 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
3073 __preempt_count_sub(val); 2998 __preempt_count_sub(val);
3074} 2999}
3075EXPORT_SYMBOL(preempt_count_sub); 3000EXPORT_SYMBOL(preempt_count_sub);
@@ -3281,7 +3206,6 @@ static void __sched notrace __schedule(bool preempt)
3281 3206
3282 trace_sched_switch(preempt, prev, next); 3207 trace_sched_switch(preempt, prev, next);
3283 rq = context_switch(rq, prev, next); /* unlocks the rq */ 3208 rq = context_switch(rq, prev, next); /* unlocks the rq */
3284 cpu = cpu_of(rq);
3285 } else { 3209 } else {
3286 lockdep_unpin_lock(&rq->lock); 3210 lockdep_unpin_lock(&rq->lock);
3287 raw_spin_unlock_irq(&rq->lock); 3211 raw_spin_unlock_irq(&rq->lock);
@@ -3467,7 +3391,7 @@ EXPORT_SYMBOL(default_wake_function);
3467 */ 3391 */
3468void rt_mutex_setprio(struct task_struct *p, int prio) 3392void rt_mutex_setprio(struct task_struct *p, int prio)
3469{ 3393{
3470 int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; 3394 int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
3471 struct rq *rq; 3395 struct rq *rq;
3472 const struct sched_class *prev_class; 3396 const struct sched_class *prev_class;
3473 3397
@@ -3495,11 +3419,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3495 3419
3496 trace_sched_pi_setprio(p, prio); 3420 trace_sched_pi_setprio(p, prio);
3497 oldprio = p->prio; 3421 oldprio = p->prio;
3422
3423 if (oldprio == prio)
3424 queue_flag &= ~DEQUEUE_MOVE;
3425
3498 prev_class = p->sched_class; 3426 prev_class = p->sched_class;
3499 queued = task_on_rq_queued(p); 3427 queued = task_on_rq_queued(p);
3500 running = task_current(rq, p); 3428 running = task_current(rq, p);
3501 if (queued) 3429 if (queued)
3502 dequeue_task(rq, p, DEQUEUE_SAVE); 3430 dequeue_task(rq, p, queue_flag);
3503 if (running) 3431 if (running)
3504 put_prev_task(rq, p); 3432 put_prev_task(rq, p);
3505 3433
@@ -3517,7 +3445,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3517 if (!dl_prio(p->normal_prio) || 3445 if (!dl_prio(p->normal_prio) ||
3518 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3446 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3519 p->dl.dl_boosted = 1; 3447 p->dl.dl_boosted = 1;
3520 enqueue_flag |= ENQUEUE_REPLENISH; 3448 queue_flag |= ENQUEUE_REPLENISH;
3521 } else 3449 } else
3522 p->dl.dl_boosted = 0; 3450 p->dl.dl_boosted = 0;
3523 p->sched_class = &dl_sched_class; 3451 p->sched_class = &dl_sched_class;
@@ -3525,7 +3453,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3525 if (dl_prio(oldprio)) 3453 if (dl_prio(oldprio))
3526 p->dl.dl_boosted = 0; 3454 p->dl.dl_boosted = 0;
3527 if (oldprio < prio) 3455 if (oldprio < prio)
3528 enqueue_flag |= ENQUEUE_HEAD; 3456 queue_flag |= ENQUEUE_HEAD;
3529 p->sched_class = &rt_sched_class; 3457 p->sched_class = &rt_sched_class;
3530 } else { 3458 } else {
3531 if (dl_prio(oldprio)) 3459 if (dl_prio(oldprio))
@@ -3540,7 +3468,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3540 if (running) 3468 if (running)
3541 p->sched_class->set_curr_task(rq); 3469 p->sched_class->set_curr_task(rq);
3542 if (queued) 3470 if (queued)
3543 enqueue_task(rq, p, enqueue_flag); 3471 enqueue_task(rq, p, queue_flag);
3544 3472
3545 check_class_changed(rq, p, prev_class, oldprio); 3473 check_class_changed(rq, p, prev_class, oldprio);
3546out_unlock: 3474out_unlock:
@@ -3896,6 +3824,7 @@ static int __sched_setscheduler(struct task_struct *p,
3896 const struct sched_class *prev_class; 3824 const struct sched_class *prev_class;
3897 struct rq *rq; 3825 struct rq *rq;
3898 int reset_on_fork; 3826 int reset_on_fork;
3827 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
3899 3828
3900 /* may grab non-irq protected spin_locks */ 3829 /* may grab non-irq protected spin_locks */
3901 BUG_ON(in_interrupt()); 3830 BUG_ON(in_interrupt());
@@ -4078,17 +4007,14 @@ change:
4078 * itself. 4007 * itself.
4079 */ 4008 */
4080 new_effective_prio = rt_mutex_get_effective_prio(p, newprio); 4009 new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
4081 if (new_effective_prio == oldprio) { 4010 if (new_effective_prio == oldprio)
4082 __setscheduler_params(p, attr); 4011 queue_flags &= ~DEQUEUE_MOVE;
4083 task_rq_unlock(rq, p, &flags);
4084 return 0;
4085 }
4086 } 4012 }
4087 4013
4088 queued = task_on_rq_queued(p); 4014 queued = task_on_rq_queued(p);
4089 running = task_current(rq, p); 4015 running = task_current(rq, p);
4090 if (queued) 4016 if (queued)
4091 dequeue_task(rq, p, DEQUEUE_SAVE); 4017 dequeue_task(rq, p, queue_flags);
4092 if (running) 4018 if (running)
4093 put_prev_task(rq, p); 4019 put_prev_task(rq, p);
4094 4020
@@ -4098,15 +4024,14 @@ change:
4098 if (running) 4024 if (running)
4099 p->sched_class->set_curr_task(rq); 4025 p->sched_class->set_curr_task(rq);
4100 if (queued) { 4026 if (queued) {
4101 int enqueue_flags = ENQUEUE_RESTORE;
4102 /* 4027 /*
4103 * We enqueue to tail when the priority of a task is 4028 * We enqueue to tail when the priority of a task is
4104 * increased (user space view). 4029 * increased (user space view).
4105 */ 4030 */
4106 if (oldprio <= p->prio) 4031 if (oldprio < p->prio)
4107 enqueue_flags |= ENQUEUE_HEAD; 4032 queue_flags |= ENQUEUE_HEAD;
4108 4033
4109 enqueue_task(rq, p, enqueue_flags); 4034 enqueue_task(rq, p, queue_flags);
4110 } 4035 }
4111 4036
4112 check_class_changed(rq, p, prev_class, oldprio); 4037 check_class_changed(rq, p, prev_class, oldprio);
@@ -5408,183 +5333,6 @@ static void migrate_tasks(struct rq *dead_rq)
5408} 5333}
5409#endif /* CONFIG_HOTPLUG_CPU */ 5334#endif /* CONFIG_HOTPLUG_CPU */
5410 5335
5411#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5412
5413static struct ctl_table sd_ctl_dir[] = {
5414 {
5415 .procname = "sched_domain",
5416 .mode = 0555,
5417 },
5418 {}
5419};
5420
5421static struct ctl_table sd_ctl_root[] = {
5422 {
5423 .procname = "kernel",
5424 .mode = 0555,
5425 .child = sd_ctl_dir,
5426 },
5427 {}
5428};
5429
5430static struct ctl_table *sd_alloc_ctl_entry(int n)
5431{
5432 struct ctl_table *entry =
5433 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5434
5435 return entry;
5436}
5437
5438static void sd_free_ctl_entry(struct ctl_table **tablep)
5439{
5440 struct ctl_table *entry;
5441
5442 /*
5443 * In the intermediate directories, both the child directory and
5444 * procname are dynamically allocated and could fail but the mode
5445 * will always be set. In the lowest directory the names are
5446 * static strings and all have proc handlers.
5447 */
5448 for (entry = *tablep; entry->mode; entry++) {
5449 if (entry->child)
5450 sd_free_ctl_entry(&entry->child);
5451 if (entry->proc_handler == NULL)
5452 kfree(entry->procname);
5453 }
5454
5455 kfree(*tablep);
5456 *tablep = NULL;
5457}
5458
5459static int min_load_idx = 0;
5460static int max_load_idx = CPU_LOAD_IDX_MAX-1;
5461
5462static void
5463set_table_entry(struct ctl_table *entry,
5464 const char *procname, void *data, int maxlen,
5465 umode_t mode, proc_handler *proc_handler,
5466 bool load_idx)
5467{
5468 entry->procname = procname;
5469 entry->data = data;
5470 entry->maxlen = maxlen;
5471 entry->mode = mode;
5472 entry->proc_handler = proc_handler;
5473
5474 if (load_idx) {
5475 entry->extra1 = &min_load_idx;
5476 entry->extra2 = &max_load_idx;
5477 }
5478}
5479
5480static struct ctl_table *
5481sd_alloc_ctl_domain_table(struct sched_domain *sd)
5482{
5483 struct ctl_table *table = sd_alloc_ctl_entry(14);
5484
5485 if (table == NULL)
5486 return NULL;
5487
5488 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5489 sizeof(long), 0644, proc_doulongvec_minmax, false);
5490 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5491 sizeof(long), 0644, proc_doulongvec_minmax, false);
5492 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5493 sizeof(int), 0644, proc_dointvec_minmax, true);
5494 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5495 sizeof(int), 0644, proc_dointvec_minmax, true);
5496 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5497 sizeof(int), 0644, proc_dointvec_minmax, true);
5498 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5499 sizeof(int), 0644, proc_dointvec_minmax, true);
5500 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5501 sizeof(int), 0644, proc_dointvec_minmax, true);
5502 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5503 sizeof(int), 0644, proc_dointvec_minmax, false);
5504 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5505 sizeof(int), 0644, proc_dointvec_minmax, false);
5506 set_table_entry(&table[9], "cache_nice_tries",
5507 &sd->cache_nice_tries,
5508 sizeof(int), 0644, proc_dointvec_minmax, false);
5509 set_table_entry(&table[10], "flags", &sd->flags,
5510 sizeof(int), 0644, proc_dointvec_minmax, false);
5511 set_table_entry(&table[11], "max_newidle_lb_cost",
5512 &sd->max_newidle_lb_cost,
5513 sizeof(long), 0644, proc_doulongvec_minmax, false);
5514 set_table_entry(&table[12], "name", sd->name,
5515 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
5516 /* &table[13] is terminator */
5517
5518 return table;
5519}
5520
5521static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5522{
5523 struct ctl_table *entry, *table;
5524 struct sched_domain *sd;
5525 int domain_num = 0, i;
5526 char buf[32];
5527
5528 for_each_domain(cpu, sd)
5529 domain_num++;
5530 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5531 if (table == NULL)
5532 return NULL;
5533
5534 i = 0;
5535 for_each_domain(cpu, sd) {
5536 snprintf(buf, 32, "domain%d", i);
5537 entry->procname = kstrdup(buf, GFP_KERNEL);
5538 entry->mode = 0555;
5539 entry->child = sd_alloc_ctl_domain_table(sd);
5540 entry++;
5541 i++;
5542 }
5543 return table;
5544}
5545
5546static struct ctl_table_header *sd_sysctl_header;
5547static void register_sched_domain_sysctl(void)
5548{
5549 int i, cpu_num = num_possible_cpus();
5550 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5551 char buf[32];
5552
5553 WARN_ON(sd_ctl_dir[0].child);
5554 sd_ctl_dir[0].child = entry;
5555
5556 if (entry == NULL)
5557 return;
5558
5559 for_each_possible_cpu(i) {
5560 snprintf(buf, 32, "cpu%d", i);
5561 entry->procname = kstrdup(buf, GFP_KERNEL);
5562 entry->mode = 0555;
5563 entry->child = sd_alloc_ctl_cpu_table(i);
5564 entry++;
5565 }
5566
5567 WARN_ON(sd_sysctl_header);
5568 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5569}
5570
5571/* may be called multiple times per register */
5572static void unregister_sched_domain_sysctl(void)
5573{
5574 unregister_sysctl_table(sd_sysctl_header);
5575 sd_sysctl_header = NULL;
5576 if (sd_ctl_dir[0].child)
5577 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5578}
5579#else
5580static void register_sched_domain_sysctl(void)
5581{
5582}
5583static void unregister_sched_domain_sysctl(void)
5584{
5585}
5586#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
5587
5588static void set_rq_online(struct rq *rq) 5336static void set_rq_online(struct rq *rq)
5589{ 5337{
5590 if (!rq->online) { 5338 if (!rq->online) {
@@ -6176,11 +5924,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6176/* Setup the mask of cpus configured for isolated domains */ 5924/* Setup the mask of cpus configured for isolated domains */
6177static int __init isolated_cpu_setup(char *str) 5925static int __init isolated_cpu_setup(char *str)
6178{ 5926{
5927 int ret;
5928
6179 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5929 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6180 cpulist_parse(str, cpu_isolated_map); 5930 ret = cpulist_parse(str, cpu_isolated_map);
5931 if (ret) {
5932 pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
5933 return 0;
5934 }
6181 return 1; 5935 return 1;
6182} 5936}
6183
6184__setup("isolcpus=", isolated_cpu_setup); 5937__setup("isolcpus=", isolated_cpu_setup);
6185 5938
6186struct s_data { 5939struct s_data {
@@ -7863,11 +7616,9 @@ void sched_destroy_group(struct task_group *tg)
7863void sched_offline_group(struct task_group *tg) 7616void sched_offline_group(struct task_group *tg)
7864{ 7617{
7865 unsigned long flags; 7618 unsigned long flags;
7866 int i;
7867 7619
7868 /* end participation in shares distribution */ 7620 /* end participation in shares distribution */
7869 for_each_possible_cpu(i) 7621 unregister_fair_sched_group(tg);
7870 unregister_fair_sched_group(tg, i);
7871 7622
7872 spin_lock_irqsave(&task_group_lock, flags); 7623 spin_lock_irqsave(&task_group_lock, flags);
7873 list_del_rcu(&tg->list); 7624 list_del_rcu(&tg->list);
@@ -7893,7 +7644,7 @@ void sched_move_task(struct task_struct *tsk)
7893 queued = task_on_rq_queued(tsk); 7644 queued = task_on_rq_queued(tsk);
7894 7645
7895 if (queued) 7646 if (queued)
7896 dequeue_task(rq, tsk, DEQUEUE_SAVE); 7647 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
7897 if (unlikely(running)) 7648 if (unlikely(running))
7898 put_prev_task(rq, tsk); 7649 put_prev_task(rq, tsk);
7899 7650
@@ -7917,7 +7668,7 @@ void sched_move_task(struct task_struct *tsk)
7917 if (unlikely(running)) 7668 if (unlikely(running))
7918 tsk->sched_class->set_curr_task(rq); 7669 tsk->sched_class->set_curr_task(rq);
7919 if (queued) 7670 if (queued)
7920 enqueue_task(rq, tsk, ENQUEUE_RESTORE); 7671 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
7921 7672
7922 task_rq_unlock(rq, tsk, &flags); 7673 task_rq_unlock(rq, tsk, &flags);
7923} 7674}
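
[Editorial note] The schedstats changes in core.c above rely on the static-key pattern: DEFINE_STATIC_KEY_FALSE() declares a key that defaults to off, the boot-parameter and sysctl handlers flip it with static_branch_enable()/static_branch_disable(), and hot paths test it through a branch that gets patched at runtime. The sketch below is illustrative only — the key, wrapper and caller names are invented for the example and are not the kernel's actual schedstats symbols (the patch's real test is the schedstat_enabled() helper, whose definition is not part of this hunk).

#include <linux/jump_label.h>
#include <linux/printk.h>
#include <linux/types.h>

/* Illustrative stand-ins, not the kernel's actual schedstats symbols. */
DEFINE_STATIC_KEY_FALSE(example_stats_key);

static inline bool example_stats_enabled(void)
{
	/* Compiles to a patched-out branch while the key is false. */
	return static_branch_unlikely(&example_stats_key);
}

static void example_set_stats(bool enabled)
{
	if (enabled)
		static_branch_enable(&example_stats_key);
	else
		static_branch_disable(&example_stats_key);
}

static void example_hot_path(void)
{
	/* The bookkeeping body is skipped entirely while the key is off. */
	if (example_stats_enabled())
		pr_info("stats bookkeeping ran\n");
}

The point of the pattern is that the disabled case costs a patched no-op branch rather than a memory load and compare on every scheduling event.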
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b2ab2ffb1adc..75f98c5498d5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void)
262#ifdef CONFIG_PARAVIRT 262#ifdef CONFIG_PARAVIRT
263 if (static_key_false(&paravirt_steal_enabled)) { 263 if (static_key_false(&paravirt_steal_enabled)) {
264 u64 steal; 264 u64 steal;
265 cputime_t steal_ct; 265 unsigned long steal_jiffies;
266 266
267 steal = paravirt_steal_clock(smp_processor_id()); 267 steal = paravirt_steal_clock(smp_processor_id());
268 steal -= this_rq()->prev_steal_time; 268 steal -= this_rq()->prev_steal_time;
269 269
270 /* 270 /*
271 * cputime_t may be less precise than nsecs (eg: if it's 271 * steal is in nsecs but our caller is expecting steal
272 * based on jiffies). Let's cast the result to cputime 272 * time in jiffies. Let's cast the result to jiffies
273 * granularity and account the rest on the next rounds. 273 * granularity and account the rest on the next rounds.
274 */ 274 */
275 steal_ct = nsecs_to_cputime(steal); 275 steal_jiffies = nsecs_to_jiffies(steal);
276 this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); 276 this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
277 277
278 account_steal_time(steal_ct); 278 account_steal_time(jiffies_to_cputime(steal_jiffies));
279 return steal_ct; 279 return steal_jiffies;
280 } 280 }
281#endif 281#endif
282 return false; 282 return false;
@@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
668#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 668#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
669 669
670#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 670#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
671static unsigned long long vtime_delta(struct task_struct *tsk) 671static cputime_t vtime_delta(struct task_struct *tsk)
672{ 672{
673 unsigned long long clock; 673 unsigned long now = READ_ONCE(jiffies);
674 674
675 clock = local_clock(); 675 if (time_before(now, (unsigned long)tsk->vtime_snap))
676 if (clock < tsk->vtime_snap)
677 return 0; 676 return 0;
678 677
679 return clock - tsk->vtime_snap; 678 return jiffies_to_cputime(now - tsk->vtime_snap);
680} 679}
681 680
682static cputime_t get_vtime_delta(struct task_struct *tsk) 681static cputime_t get_vtime_delta(struct task_struct *tsk)
683{ 682{
684 unsigned long long delta = vtime_delta(tsk); 683 unsigned long now = READ_ONCE(jiffies);
684 unsigned long delta = now - tsk->vtime_snap;
685 685
686 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); 686 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
687 tsk->vtime_snap += delta; 687 tsk->vtime_snap = now;
688 688
689 /* CHECKME: always safe to convert nsecs to cputime? */ 689 return jiffies_to_cputime(delta);
690 return nsecs_to_cputime(delta);
691} 690}
692 691
693static void __vtime_account_system(struct task_struct *tsk) 692static void __vtime_account_system(struct task_struct *tsk)
@@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk)
699 698
700void vtime_account_system(struct task_struct *tsk) 699void vtime_account_system(struct task_struct *tsk)
701{ 700{
701 if (!vtime_delta(tsk))
702 return;
703
702 write_seqcount_begin(&tsk->vtime_seqcount); 704 write_seqcount_begin(&tsk->vtime_seqcount);
703 __vtime_account_system(tsk); 705 __vtime_account_system(tsk);
704 write_seqcount_end(&tsk->vtime_seqcount); 706 write_seqcount_end(&tsk->vtime_seqcount);
@@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk)
707void vtime_gen_account_irq_exit(struct task_struct *tsk) 709void vtime_gen_account_irq_exit(struct task_struct *tsk)
708{ 710{
709 write_seqcount_begin(&tsk->vtime_seqcount); 711 write_seqcount_begin(&tsk->vtime_seqcount);
710 __vtime_account_system(tsk); 712 if (vtime_delta(tsk))
713 __vtime_account_system(tsk);
711 if (context_tracking_in_user()) 714 if (context_tracking_in_user())
712 tsk->vtime_snap_whence = VTIME_USER; 715 tsk->vtime_snap_whence = VTIME_USER;
713 write_seqcount_end(&tsk->vtime_seqcount); 716 write_seqcount_end(&tsk->vtime_seqcount);
@@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk)
718 cputime_t delta_cpu; 721 cputime_t delta_cpu;
719 722
720 write_seqcount_begin(&tsk->vtime_seqcount); 723 write_seqcount_begin(&tsk->vtime_seqcount);
721 delta_cpu = get_vtime_delta(tsk);
722 tsk->vtime_snap_whence = VTIME_SYS; 724 tsk->vtime_snap_whence = VTIME_SYS;
723 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); 725 if (vtime_delta(tsk)) {
726 delta_cpu = get_vtime_delta(tsk);
727 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
728 }
724 write_seqcount_end(&tsk->vtime_seqcount); 729 write_seqcount_end(&tsk->vtime_seqcount);
725} 730}
726 731
727void vtime_user_enter(struct task_struct *tsk) 732void vtime_user_enter(struct task_struct *tsk)
728{ 733{
729 write_seqcount_begin(&tsk->vtime_seqcount); 734 write_seqcount_begin(&tsk->vtime_seqcount);
730 __vtime_account_system(tsk); 735 if (vtime_delta(tsk))
736 __vtime_account_system(tsk);
731 tsk->vtime_snap_whence = VTIME_USER; 737 tsk->vtime_snap_whence = VTIME_USER;
732 write_seqcount_end(&tsk->vtime_seqcount); 738 write_seqcount_end(&tsk->vtime_seqcount);
733} 739}
@@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk)
742 * that can thus safely catch up with a tickless delta. 748 * that can thus safely catch up with a tickless delta.
743 */ 749 */
744 write_seqcount_begin(&tsk->vtime_seqcount); 750 write_seqcount_begin(&tsk->vtime_seqcount);
745 __vtime_account_system(tsk); 751 if (vtime_delta(tsk))
752 __vtime_account_system(tsk);
746 current->flags |= PF_VCPU; 753 current->flags |= PF_VCPU;
747 write_seqcount_end(&tsk->vtime_seqcount); 754 write_seqcount_end(&tsk->vtime_seqcount);
748} 755}
@@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
772 779
773 write_seqcount_begin(&current->vtime_seqcount); 780 write_seqcount_begin(&current->vtime_seqcount);
774 current->vtime_snap_whence = VTIME_SYS; 781 current->vtime_snap_whence = VTIME_SYS;
775 current->vtime_snap = sched_clock_cpu(smp_processor_id()); 782 current->vtime_snap = jiffies;
776 write_seqcount_end(&current->vtime_seqcount); 783 write_seqcount_end(&current->vtime_seqcount);
777} 784}
778 785
@@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
783 local_irq_save(flags); 790 local_irq_save(flags);
784 write_seqcount_begin(&t->vtime_seqcount); 791 write_seqcount_begin(&t->vtime_seqcount);
785 t->vtime_snap_whence = VTIME_SYS; 792 t->vtime_snap_whence = VTIME_SYS;
786 t->vtime_snap = sched_clock_cpu(cpu); 793 t->vtime_snap = jiffies;
787 write_seqcount_end(&t->vtime_seqcount); 794 write_seqcount_end(&t->vtime_seqcount);
788 local_irq_restore(flags); 795 local_irq_restore(flags);
789} 796}
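
[Editorial note] The cputime.c hunks above move both steal-time accounting and VIRT_CPU_ACCOUNTING_GEN to jiffy granularity: nanoseconds are converted to whole jiffies, only the converted portion is charged against prev_steal_time (or vtime_snap), and the sub-jiffy remainder carries over to a later tick. A minimal user-space model of that carry, assuming HZ=1000 so one jiffy is 1,000,000 ns (the variable and function names are invented for the example):

#include <stdio.h>

#define NSEC_PER_JIFFY 1000000ULL   /* assumes HZ = 1000 */

static unsigned long long prev_steal_ns;  /* stands in for rq->prev_steal_time */

static unsigned long account_steal(unsigned long long steal_clock_ns)
{
	unsigned long long steal = steal_clock_ns - prev_steal_ns;
	unsigned long steal_jiffies = (unsigned long)(steal / NSEC_PER_JIFFY);

	/* Only the whole jiffies are charged; the remainder stays pending. */
	prev_steal_ns += (unsigned long long)steal_jiffies * NSEC_PER_JIFFY;
	return steal_jiffies;
}

int main(void)
{
	/* 3.5 ms of steal: 3 jiffies accounted now, 0.5 ms carried over. */
	printf("%lu\n", account_steal(3500000ULL));
	/* 0.6 ms more: the carried 0.5 ms pushes this sample to 1 jiffy. */
	printf("%lu\n", account_steal(4100000ULL));
	return 0;
}

Run as-is this prints 3 and then 1: the 0.5 ms left over from the first call is what lifts the second 0.6 ms sample over the one-jiffy line.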
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 57b939c81bce..c7a036facbe1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
352 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 352 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
353 struct rq *rq = rq_of_dl_rq(dl_rq); 353 struct rq *rq = rq_of_dl_rq(dl_rq);
354 354
355 WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); 355 WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
356
357 /*
358 * We are racing with the deadline timer. So, do nothing because
359 * the deadline timer handler will take care of properly recharging
360 * the runtime and postponing the deadline
361 */
362 if (dl_se->dl_throttled)
363 return;
356 364
357 /* 365 /*
358 * We use the regular wall clock time to set deadlines in the 366 * We use the regular wall clock time to set deadlines in the
@@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
361 */ 369 */
362 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 370 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
363 dl_se->runtime = pi_se->dl_runtime; 371 dl_se->runtime = pi_se->dl_runtime;
364 dl_se->dl_new = 0;
365} 372}
366 373
367/* 374/*
@@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
399 dl_se->runtime = pi_se->dl_runtime; 406 dl_se->runtime = pi_se->dl_runtime;
400 } 407 }
401 408
409 if (dl_se->dl_yielded && dl_se->runtime > 0)
410 dl_se->runtime = 0;
411
402 /* 412 /*
403 * We keep moving the deadline away until we get some 413 * We keep moving the deadline away until we get some
404 * available runtime for the entity. This ensures correct 414 * available runtime for the entity. This ensures correct
@@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
500 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 510 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
501 struct rq *rq = rq_of_dl_rq(dl_rq); 511 struct rq *rq = rq_of_dl_rq(dl_rq);
502 512
503 /*
504 * The arrival of a new instance needs special treatment, i.e.,
505 * the actual scheduling parameters have to be "renewed".
506 */
507 if (dl_se->dl_new) {
508 setup_new_dl_entity(dl_se, pi_se);
509 return;
510 }
511
512 if (dl_time_before(dl_se->deadline, rq_clock(rq)) || 513 if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
513 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { 514 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
514 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 515 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
@@ -605,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
605 } 606 }
606 607
607 /* 608 /*
608 * This is possible if switched_from_dl() raced against a running
609 * callback that took the above !dl_task() path and we've since then
610 * switched back into SCHED_DEADLINE.
611 *
612 * There's nothing to do except drop our task reference.
613 */
614 if (dl_se->dl_new)
615 goto unlock;
616
617 /*
618 * The task might have been boosted by someone else and might be in the 609 * The task might have been boosted by someone else and might be in the
 619 * boosting/deboosting path, it's not throttled. 610 * boosting/deboosting path, it's not throttled.
620 */ 611 */
@@ -735,8 +726,11 @@ static void update_curr_dl(struct rq *rq)
735 * approach need further study. 726 * approach need further study.
736 */ 727 */
737 delta_exec = rq_clock_task(rq) - curr->se.exec_start; 728 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
738 if (unlikely((s64)delta_exec <= 0)) 729 if (unlikely((s64)delta_exec <= 0)) {
730 if (unlikely(dl_se->dl_yielded))
731 goto throttle;
739 return; 732 return;
733 }
740 734
741 schedstat_set(curr->se.statistics.exec_max, 735 schedstat_set(curr->se.statistics.exec_max,
742 max(curr->se.statistics.exec_max, delta_exec)); 736 max(curr->se.statistics.exec_max, delta_exec));
@@ -749,8 +743,10 @@ static void update_curr_dl(struct rq *rq)
749 743
750 sched_rt_avg_update(rq, delta_exec); 744 sched_rt_avg_update(rq, delta_exec);
751 745
752 dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; 746 dl_se->runtime -= delta_exec;
753 if (dl_runtime_exceeded(dl_se)) { 747
748throttle:
749 if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
754 dl_se->dl_throttled = 1; 750 dl_se->dl_throttled = 1;
755 __dequeue_task_dl(rq, curr, 0); 751 __dequeue_task_dl(rq, curr, 0);
756 if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) 752 if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
@@ -917,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
917 * parameters of the task might need updating. Otherwise, 913 * parameters of the task might need updating. Otherwise,
918 * we want a replenishment of its runtime. 914 * we want a replenishment of its runtime.
919 */ 915 */
920 if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) 916 if (flags & ENQUEUE_WAKEUP)
921 update_dl_entity(dl_se, pi_se); 917 update_dl_entity(dl_se, pi_se);
922 else if (flags & ENQUEUE_REPLENISH) 918 else if (flags & ENQUEUE_REPLENISH)
923 replenish_dl_entity(dl_se, pi_se); 919 replenish_dl_entity(dl_se, pi_se);
@@ -994,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
994 */ 990 */
995static void yield_task_dl(struct rq *rq) 991static void yield_task_dl(struct rq *rq)
996{ 992{
997 struct task_struct *p = rq->curr;
998
999 /* 993 /*
1000 * We make the task go to sleep until its current deadline by 994 * We make the task go to sleep until its current deadline by
1001 * forcing its runtime to zero. This way, update_curr_dl() stops 995 * forcing its runtime to zero. This way, update_curr_dl() stops
1002 * it and the bandwidth timer will wake it up and will give it 996 * it and the bandwidth timer will wake it up and will give it
1003 * new scheduling parameters (thanks to dl_yielded=1). 997 * new scheduling parameters (thanks to dl_yielded=1).
1004 */ 998 */
1005 if (p->dl.runtime > 0) { 999 rq->curr->dl.dl_yielded = 1;
1006 rq->curr->dl.dl_yielded = 1; 1000
1007 p->dl.runtime = 0;
1008 }
1009 update_rq_clock(rq); 1001 update_rq_clock(rq);
1010 update_curr_dl(rq); 1002 update_curr_dl(rq);
1011 /* 1003 /*
@@ -1722,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
1722 */ 1714 */
1723static void switched_to_dl(struct rq *rq, struct task_struct *p) 1715static void switched_to_dl(struct rq *rq, struct task_struct *p)
1724{ 1716{
1717 if (dl_time_before(p->dl.deadline, rq_clock(rq)))
1718 setup_new_dl_entity(&p->dl, &p->dl);
1719
1725 if (task_on_rq_queued(p) && rq->curr != p) { 1720 if (task_on_rq_queued(p) && rq->curr != p) {
1726#ifdef CONFIG_SMP 1721#ifdef CONFIG_SMP
1727 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) 1722 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
@@ -1768,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1768 */ 1763 */
1769 resched_curr(rq); 1764 resched_curr(rq);
1770#endif /* CONFIG_SMP */ 1765#endif /* CONFIG_SMP */
1771 } else 1766 }
1772 switched_to_dl(rq, p);
1773} 1767}
1774 1768
1775const struct sched_class dl_sched_class = { 1769const struct sched_class dl_sched_class = {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 641511771ae6..4fbc3bd5ff60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -16,6 +16,7 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18#include <linux/mempolicy.h> 18#include <linux/mempolicy.h>
19#include <linux/debugfs.h>
19 20
20#include "sched.h" 21#include "sched.h"
21 22
@@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec)
58 59
59#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 60#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
60 61
62#define SCHED_FEAT(name, enabled) \
63 #name ,
64
65static const char * const sched_feat_names[] = {
66#include "features.h"
67};
68
69#undef SCHED_FEAT
70
71static int sched_feat_show(struct seq_file *m, void *v)
72{
73 int i;
74
75 for (i = 0; i < __SCHED_FEAT_NR; i++) {
76 if (!(sysctl_sched_features & (1UL << i)))
77 seq_puts(m, "NO_");
78 seq_printf(m, "%s ", sched_feat_names[i]);
79 }
80 seq_puts(m, "\n");
81
82 return 0;
83}
84
85#ifdef HAVE_JUMP_LABEL
86
87#define jump_label_key__true STATIC_KEY_INIT_TRUE
88#define jump_label_key__false STATIC_KEY_INIT_FALSE
89
90#define SCHED_FEAT(name, enabled) \
91 jump_label_key__##enabled ,
92
93struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
94#include "features.h"
95};
96
97#undef SCHED_FEAT
98
99static void sched_feat_disable(int i)
100{
101 static_key_disable(&sched_feat_keys[i]);
102}
103
104static void sched_feat_enable(int i)
105{
106 static_key_enable(&sched_feat_keys[i]);
107}
108#else
109static void sched_feat_disable(int i) { };
110static void sched_feat_enable(int i) { };
111#endif /* HAVE_JUMP_LABEL */
112
113static int sched_feat_set(char *cmp)
114{
115 int i;
116 int neg = 0;
117
118 if (strncmp(cmp, "NO_", 3) == 0) {
119 neg = 1;
120 cmp += 3;
121 }
122
123 for (i = 0; i < __SCHED_FEAT_NR; i++) {
124 if (strcmp(cmp, sched_feat_names[i]) == 0) {
125 if (neg) {
126 sysctl_sched_features &= ~(1UL << i);
127 sched_feat_disable(i);
128 } else {
129 sysctl_sched_features |= (1UL << i);
130 sched_feat_enable(i);
131 }
132 break;
133 }
134 }
135
136 return i;
137}
138
139static ssize_t
140sched_feat_write(struct file *filp, const char __user *ubuf,
141 size_t cnt, loff_t *ppos)
142{
143 char buf[64];
144 char *cmp;
145 int i;
146 struct inode *inode;
147
148 if (cnt > 63)
149 cnt = 63;
150
151 if (copy_from_user(&buf, ubuf, cnt))
152 return -EFAULT;
153
154 buf[cnt] = 0;
155 cmp = strstrip(buf);
156
157 /* Ensure the static_key remains in a consistent state */
158 inode = file_inode(filp);
159 inode_lock(inode);
160 i = sched_feat_set(cmp);
161 inode_unlock(inode);
162 if (i == __SCHED_FEAT_NR)
163 return -EINVAL;
164
165 *ppos += cnt;
166
167 return cnt;
168}
169
170static int sched_feat_open(struct inode *inode, struct file *filp)
171{
172 return single_open(filp, sched_feat_show, NULL);
173}
174
175static const struct file_operations sched_feat_fops = {
176 .open = sched_feat_open,
177 .write = sched_feat_write,
178 .read = seq_read,
179 .llseek = seq_lseek,
180 .release = single_release,
181};
182
183static __init int sched_init_debug(void)
184{
185 debugfs_create_file("sched_features", 0644, NULL, NULL,
186 &sched_feat_fops);
187
188 return 0;
189}
190late_initcall(sched_init_debug);
191
192#ifdef CONFIG_SMP
193
194#ifdef CONFIG_SYSCTL
195
196static struct ctl_table sd_ctl_dir[] = {
197 {
198 .procname = "sched_domain",
199 .mode = 0555,
200 },
201 {}
202};
203
204static struct ctl_table sd_ctl_root[] = {
205 {
206 .procname = "kernel",
207 .mode = 0555,
208 .child = sd_ctl_dir,
209 },
210 {}
211};
212
213static struct ctl_table *sd_alloc_ctl_entry(int n)
214{
215 struct ctl_table *entry =
216 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
217
218 return entry;
219}
220
221static void sd_free_ctl_entry(struct ctl_table **tablep)
222{
223 struct ctl_table *entry;
224
225 /*
226 * In the intermediate directories, both the child directory and
227 * procname are dynamically allocated and could fail but the mode
228 * will always be set. In the lowest directory the names are
229 * static strings and all have proc handlers.
230 */
231 for (entry = *tablep; entry->mode; entry++) {
232 if (entry->child)
233 sd_free_ctl_entry(&entry->child);
234 if (entry->proc_handler == NULL)
235 kfree(entry->procname);
236 }
237
238 kfree(*tablep);
239 *tablep = NULL;
240}
241
242static int min_load_idx = 0;
243static int max_load_idx = CPU_LOAD_IDX_MAX-1;
244
245static void
246set_table_entry(struct ctl_table *entry,
247 const char *procname, void *data, int maxlen,
248 umode_t mode, proc_handler *proc_handler,
249 bool load_idx)
250{
251 entry->procname = procname;
252 entry->data = data;
253 entry->maxlen = maxlen;
254 entry->mode = mode;
255 entry->proc_handler = proc_handler;
256
257 if (load_idx) {
258 entry->extra1 = &min_load_idx;
259 entry->extra2 = &max_load_idx;
260 }
261}
262
263static struct ctl_table *
264sd_alloc_ctl_domain_table(struct sched_domain *sd)
265{
266 struct ctl_table *table = sd_alloc_ctl_entry(14);
267
268 if (table == NULL)
269 return NULL;
270
271 set_table_entry(&table[0], "min_interval", &sd->min_interval,
272 sizeof(long), 0644, proc_doulongvec_minmax, false);
273 set_table_entry(&table[1], "max_interval", &sd->max_interval,
274 sizeof(long), 0644, proc_doulongvec_minmax, false);
275 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
276 sizeof(int), 0644, proc_dointvec_minmax, true);
277 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
278 sizeof(int), 0644, proc_dointvec_minmax, true);
279 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
280 sizeof(int), 0644, proc_dointvec_minmax, true);
281 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
282 sizeof(int), 0644, proc_dointvec_minmax, true);
283 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
284 sizeof(int), 0644, proc_dointvec_minmax, true);
285 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
286 sizeof(int), 0644, proc_dointvec_minmax, false);
287 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
288 sizeof(int), 0644, proc_dointvec_minmax, false);
289 set_table_entry(&table[9], "cache_nice_tries",
290 &sd->cache_nice_tries,
291 sizeof(int), 0644, proc_dointvec_minmax, false);
292 set_table_entry(&table[10], "flags", &sd->flags,
293 sizeof(int), 0644, proc_dointvec_minmax, false);
294 set_table_entry(&table[11], "max_newidle_lb_cost",
295 &sd->max_newidle_lb_cost,
296 sizeof(long), 0644, proc_doulongvec_minmax, false);
297 set_table_entry(&table[12], "name", sd->name,
298 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
299 /* &table[13] is terminator */
300
301 return table;
302}
303
304static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
305{
306 struct ctl_table *entry, *table;
307 struct sched_domain *sd;
308 int domain_num = 0, i;
309 char buf[32];
310
311 for_each_domain(cpu, sd)
312 domain_num++;
313 entry = table = sd_alloc_ctl_entry(domain_num + 1);
314 if (table == NULL)
315 return NULL;
316
317 i = 0;
318 for_each_domain(cpu, sd) {
319 snprintf(buf, 32, "domain%d", i);
320 entry->procname = kstrdup(buf, GFP_KERNEL);
321 entry->mode = 0555;
322 entry->child = sd_alloc_ctl_domain_table(sd);
323 entry++;
324 i++;
325 }
326 return table;
327}
328
329static struct ctl_table_header *sd_sysctl_header;
330void register_sched_domain_sysctl(void)
331{
332 int i, cpu_num = num_possible_cpus();
333 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
334 char buf[32];
335
336 WARN_ON(sd_ctl_dir[0].child);
337 sd_ctl_dir[0].child = entry;
338
339 if (entry == NULL)
340 return;
341
342 for_each_possible_cpu(i) {
343 snprintf(buf, 32, "cpu%d", i);
344 entry->procname = kstrdup(buf, GFP_KERNEL);
345 entry->mode = 0555;
346 entry->child = sd_alloc_ctl_cpu_table(i);
347 entry++;
348 }
349
350 WARN_ON(sd_sysctl_header);
351 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
352}
353
354/* may be called multiple times per register */
355void unregister_sched_domain_sysctl(void)
356{
357 unregister_sysctl_table(sd_sysctl_header);
358 sd_sysctl_header = NULL;
359 if (sd_ctl_dir[0].child)
360 sd_free_ctl_entry(&sd_ctl_dir[0].child);
361}
362#endif /* CONFIG_SYSCTL */
363#endif /* CONFIG_SMP */
364
61#ifdef CONFIG_FAIR_GROUP_SCHED 365#ifdef CONFIG_FAIR_GROUP_SCHED
62static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) 366static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
63{ 367{
@@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
75 PN(se->vruntime); 379 PN(se->vruntime);
76 PN(se->sum_exec_runtime); 380 PN(se->sum_exec_runtime);
77#ifdef CONFIG_SCHEDSTATS 381#ifdef CONFIG_SCHEDSTATS
78 PN(se->statistics.wait_start); 382 if (schedstat_enabled()) {
79 PN(se->statistics.sleep_start); 383 PN(se->statistics.wait_start);
80 PN(se->statistics.block_start); 384 PN(se->statistics.sleep_start);
81 PN(se->statistics.sleep_max); 385 PN(se->statistics.block_start);
82 PN(se->statistics.block_max); 386 PN(se->statistics.sleep_max);
83 PN(se->statistics.exec_max); 387 PN(se->statistics.block_max);
84 PN(se->statistics.slice_max); 388 PN(se->statistics.exec_max);
85 PN(se->statistics.wait_max); 389 PN(se->statistics.slice_max);
86 PN(se->statistics.wait_sum); 390 PN(se->statistics.wait_max);
87 P(se->statistics.wait_count); 391 PN(se->statistics.wait_sum);
392 P(se->statistics.wait_count);
393 }
88#endif 394#endif
89 P(se->load.weight); 395 P(se->load.weight);
90#ifdef CONFIG_SMP 396#ifdef CONFIG_SMP
@@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
122 (long long)(p->nvcsw + p->nivcsw), 428 (long long)(p->nvcsw + p->nivcsw),
123 p->prio); 429 p->prio);
124#ifdef CONFIG_SCHEDSTATS 430#ifdef CONFIG_SCHEDSTATS
125 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 431 if (schedstat_enabled()) {
126 SPLIT_NS(p->se.statistics.wait_sum), 432 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
127 SPLIT_NS(p->se.sum_exec_runtime), 433 SPLIT_NS(p->se.statistics.wait_sum),
128 SPLIT_NS(p->se.statistics.sum_sleep_runtime)); 434 SPLIT_NS(p->se.sum_exec_runtime),
435 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
436 }
129#else 437#else
130 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 438 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
131 0LL, 0L, 439 0LL, 0L,
@@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
258 566
259void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) 567void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
260{ 568{
569 struct dl_bw *dl_bw;
570
261 SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); 571 SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
262 SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); 572 SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
573#ifdef CONFIG_SMP
574 dl_bw = &cpu_rq(cpu)->rd->dl_bw;
575#else
576 dl_bw = &dl_rq->dl_bw;
577#endif
578 SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
579 SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
263} 580}
264 581
265extern __read_mostly int sched_clock_running; 582extern __read_mostly int sched_clock_running;
@@ -313,17 +630,18 @@ do { \
313#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 630#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
314#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); 631#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
315 632
316 P(yld_count);
317
318 P(sched_count);
319 P(sched_goidle);
320#ifdef CONFIG_SMP 633#ifdef CONFIG_SMP
321 P64(avg_idle); 634 P64(avg_idle);
322 P64(max_idle_balance_cost); 635 P64(max_idle_balance_cost);
323#endif 636#endif
324 637
325 P(ttwu_count); 638 if (schedstat_enabled()) {
326 P(ttwu_local); 639 P(yld_count);
640 P(sched_count);
641 P(sched_goidle);
642 P(ttwu_count);
643 P(ttwu_local);
644 }
327 645
328#undef P 646#undef P
329#undef P64 647#undef P64
@@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
569 nr_switches = p->nvcsw + p->nivcsw; 887 nr_switches = p->nvcsw + p->nivcsw;
570 888
571#ifdef CONFIG_SCHEDSTATS 889#ifdef CONFIG_SCHEDSTATS
572 PN(se.statistics.sum_sleep_runtime);
573 PN(se.statistics.wait_start);
574 PN(se.statistics.sleep_start);
575 PN(se.statistics.block_start);
576 PN(se.statistics.sleep_max);
577 PN(se.statistics.block_max);
578 PN(se.statistics.exec_max);
579 PN(se.statistics.slice_max);
580 PN(se.statistics.wait_max);
581 PN(se.statistics.wait_sum);
582 P(se.statistics.wait_count);
583 PN(se.statistics.iowait_sum);
584 P(se.statistics.iowait_count);
585 P(se.nr_migrations); 890 P(se.nr_migrations);
586 P(se.statistics.nr_migrations_cold);
587 P(se.statistics.nr_failed_migrations_affine);
588 P(se.statistics.nr_failed_migrations_running);
589 P(se.statistics.nr_failed_migrations_hot);
590 P(se.statistics.nr_forced_migrations);
591 P(se.statistics.nr_wakeups);
592 P(se.statistics.nr_wakeups_sync);
593 P(se.statistics.nr_wakeups_migrate);
594 P(se.statistics.nr_wakeups_local);
595 P(se.statistics.nr_wakeups_remote);
596 P(se.statistics.nr_wakeups_affine);
597 P(se.statistics.nr_wakeups_affine_attempts);
598 P(se.statistics.nr_wakeups_passive);
599 P(se.statistics.nr_wakeups_idle);
600 891
601 { 892 if (schedstat_enabled()) {
602 u64 avg_atom, avg_per_cpu; 893 u64 avg_atom, avg_per_cpu;
603 894
895 PN(se.statistics.sum_sleep_runtime);
896 PN(se.statistics.wait_start);
897 PN(se.statistics.sleep_start);
898 PN(se.statistics.block_start);
899 PN(se.statistics.sleep_max);
900 PN(se.statistics.block_max);
901 PN(se.statistics.exec_max);
902 PN(se.statistics.slice_max);
903 PN(se.statistics.wait_max);
904 PN(se.statistics.wait_sum);
905 P(se.statistics.wait_count);
906 PN(se.statistics.iowait_sum);
907 P(se.statistics.iowait_count);
908 P(se.statistics.nr_migrations_cold);
909 P(se.statistics.nr_failed_migrations_affine);
910 P(se.statistics.nr_failed_migrations_running);
911 P(se.statistics.nr_failed_migrations_hot);
912 P(se.statistics.nr_forced_migrations);
913 P(se.statistics.nr_wakeups);
914 P(se.statistics.nr_wakeups_sync);
915 P(se.statistics.nr_wakeups_migrate);
916 P(se.statistics.nr_wakeups_local);
917 P(se.statistics.nr_wakeups_remote);
918 P(se.statistics.nr_wakeups_affine);
919 P(se.statistics.nr_wakeups_affine_attempts);
920 P(se.statistics.nr_wakeups_passive);
921 P(se.statistics.nr_wakeups_idle);
922
604 avg_atom = p->se.sum_exec_runtime; 923 avg_atom = p->se.sum_exec_runtime;
605 if (nr_switches) 924 if (nr_switches)
606 avg_atom = div64_ul(avg_atom, nr_switches); 925 avg_atom = div64_ul(avg_atom, nr_switches);
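
[Editorial note] The sched_feat_names table added to debug.c above is generated by redefining SCHED_FEAT() as a stringizing macro around an include of "features.h" — the classic X-macro trick. Below is a self-contained model of the same trick; the feature list macro and feature names are invented for the example and stand in for the re-included header.

#include <stdio.h>

/*
 * Stand-in for "features.h": the real file is re-#included between a
 * #define and #undef of SCHED_FEAT(); a list macro keeps this example
 * self-contained.
 */
#define EXAMPLE_FEATURES(F)		\
	F(START_DEBIT, true)		\
	F(HRTICK, false)

#define FEAT_NAME(name, enabled) #name,
static const char * const feat_names[] = {
	EXAMPLE_FEATURES(FEAT_NAME)
};
#undef FEAT_NAME

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(feat_names) / sizeof(feat_names[0]); i++)
		printf("%s\n", feat_names[i]);
	return 0;
}

The kernel variant obtains the list by including features.h between the #define and #undef instead of expanding a list macro, but the generated array has the same shape.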
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b7d4b83947..33130529e9b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,8 +20,8 @@
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra 20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
21 */ 21 */
22 22
23#include <linux/latencytop.h>
24#include <linux/sched.h> 23#include <linux/sched.h>
24#include <linux/latencytop.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/cpuidle.h> 26#include <linux/cpuidle.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
@@ -755,7 +755,9 @@ static void
755update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 755update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
756{ 756{
757 struct task_struct *p; 757 struct task_struct *p;
758 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; 758 u64 delta;
759
760 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
759 761
760 if (entity_is_task(se)) { 762 if (entity_is_task(se)) {
761 p = task_of(se); 763 p = task_of(se);
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
776 se->statistics.wait_sum += delta; 778 se->statistics.wait_sum += delta;
777 se->statistics.wait_start = 0; 779 se->statistics.wait_start = 0;
778} 780}
779#else
780static inline void
781update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
782{
783}
784
785static inline void
786update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
787{
788}
789#endif
790 781
791/* 782/*
792 * Task is being enqueued - update stats: 783 * Task is being enqueued - update stats:
793 */ 784 */
794static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 785static inline void
786update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
795{ 787{
796 /* 788 /*
797 * Are we enqueueing a waiting task? (for current tasks 789 * Are we enqueueing a waiting task? (for current tasks
@@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
802} 794}
803 795
804static inline void 796static inline void
805update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 797update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
806{ 798{
807 /* 799 /*
808 * Mark the end of the wait period if dequeueing a 800 * Mark the end of the wait period if dequeueing a
@@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
810 */ 802 */
811 if (se != cfs_rq->curr) 803 if (se != cfs_rq->curr)
812 update_stats_wait_end(cfs_rq, se); 804 update_stats_wait_end(cfs_rq, se);
805
806 if (flags & DEQUEUE_SLEEP) {
807 if (entity_is_task(se)) {
808 struct task_struct *tsk = task_of(se);
809
810 if (tsk->state & TASK_INTERRUPTIBLE)
811 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
812 if (tsk->state & TASK_UNINTERRUPTIBLE)
813 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
814 }
815 }
816
817}
818#else
819static inline void
820update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
821{
813} 822}
814 823
824static inline void
825update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
826{
827}
828
829static inline void
830update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
831{
832}
833
834static inline void
835update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
836{
837}
838#endif
839
815/* 840/*
816 * We are picking a new current task - update its stats: 841 * We are picking a new current task - update its stats:
817 */ 842 */
@@ -907,10 +932,11 @@ struct numa_group {
907 spinlock_t lock; /* nr_tasks, tasks */ 932 spinlock_t lock; /* nr_tasks, tasks */
908 int nr_tasks; 933 int nr_tasks;
909 pid_t gid; 934 pid_t gid;
935 int active_nodes;
910 936
911 struct rcu_head rcu; 937 struct rcu_head rcu;
912 nodemask_t active_nodes;
913 unsigned long total_faults; 938 unsigned long total_faults;
939 unsigned long max_faults_cpu;
914 /* 940 /*
915 * Faults_cpu is used to decide whether memory should move 941 * Faults_cpu is used to decide whether memory should move
916 * towards the CPU. As a consequence, these stats are weighted 942 * towards the CPU. As a consequence, these stats are weighted
@@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
969 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; 995 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
970} 996}
971 997
998/*
999 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1000 * considered part of a numa group's pseudo-interleaving set. Migrations
1001 * between these nodes are slowed down, to allow things to settle down.
1002 */
1003#define ACTIVE_NODE_FRACTION 3
1004
1005static bool numa_is_active_node(int nid, struct numa_group *ng)
1006{
1007 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1008}
1009
972/* Handle placement on systems where not all nodes are directly connected. */ 1010/* Handle placement on systems where not all nodes are directly connected. */
973static unsigned long score_nearby_nodes(struct task_struct *p, int nid, 1011static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
974 int maxdist, bool task) 1012 int maxdist, bool task)
@@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1118 return true; 1156 return true;
1119 1157
1120 /* 1158 /*
1121 * Do not migrate if the destination is not a node that 1159 * Destination node is much more heavily used than the source
1122 * is actively used by this numa group. 1160 * node? Allow migration.
1123 */ 1161 */
1124 if (!node_isset(dst_nid, ng->active_nodes)) 1162 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1125 return false; 1163 ACTIVE_NODE_FRACTION)
1126
1127 /*
1128 * Source is a node that is not actively used by this
1129 * numa group, while the destination is. Migrate.
1130 */
1131 if (!node_isset(src_nid, ng->active_nodes))
1132 return true; 1164 return true;
1133 1165
1134 /* 1166 /*
1135 * Both source and destination are nodes in active 1167 * Distribute memory according to CPU & memory use on each node,
1136 * use by this numa group. Maximize memory bandwidth 1168 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1137 * by migrating from more heavily used groups, to less 1169 *
1138 * heavily used ones, spreading the load around. 1170 * faults_cpu(dst) 3 faults_cpu(src)
1139 * Use a 1/4 hysteresis to avoid spurious page movement. 1171 * --------------- * - > ---------------
1172 * faults_mem(dst) 4 faults_mem(src)
1140 */ 1173 */
1141 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); 1174 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1175 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1142} 1176}
1143 1177
1144static unsigned long weighted_cpuload(const int cpu); 1178static unsigned long weighted_cpuload(const int cpu);
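
[Editorial note] The new migration test above encodes the ratio comparison from the comment, faults_cpu(dst)/faults_mem(dst) * 3/4 > faults_cpu(src)/faults_mem(src), by cross-multiplying so no divisions (and no divide-by-zero checks) are needed. A small standalone model, with names invented for the example:

#include <stdbool.h>
#include <stdio.h>

/*
 * Cross-multiplied form of:
 *   faults_cpu(dst) / faults_mem(dst) * 3/4  >  faults_cpu(src) / faults_mem(src)
 * so no division (and no zero checks) are needed.
 */
static bool prefer_dst(unsigned long cpu_dst, unsigned long mem_dst,
		       unsigned long cpu_src, unsigned long mem_src)
{
	return cpu_dst * mem_src * 3 > cpu_src * mem_dst * 4;
}

int main(void)
{
	/* dst has twice the CPU/memory fault ratio of src: migrate (prints 1). */
	printf("%d\n", prefer_dst(200, 100, 100, 100));
	/* Equal ratios: the 3/4 hysteresis keeps the page put (prints 0). */
	printf("%d\n", prefer_dst(100, 100, 100, 100));
	return 0;
}

The 3/4 factor is the hysteresis: with equal ratios the comparison fails, so pages do not ping-pong between two equally busy nodes.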
@@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
1484 1518
1485 .best_task = NULL, 1519 .best_task = NULL,
1486 .best_imp = 0, 1520 .best_imp = 0,
1487 .best_cpu = -1 1521 .best_cpu = -1,
1488 }; 1522 };
1489 struct sched_domain *sd; 1523 struct sched_domain *sd;
1490 unsigned long taskweight, groupweight; 1524 unsigned long taskweight, groupweight;
@@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
1536 * multiple NUMA nodes; in order to better consolidate the group, 1570 * multiple NUMA nodes; in order to better consolidate the group,
1537 * we need to check other locations. 1571 * we need to check other locations.
1538 */ 1572 */
1539 if (env.best_cpu == -1 || (p->numa_group && 1573 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
1540 nodes_weight(p->numa_group->active_nodes) > 1)) {
1541 for_each_online_node(nid) { 1574 for_each_online_node(nid) {
1542 if (nid == env.src_nid || nid == p->numa_preferred_nid) 1575 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1543 continue; 1576 continue;
@@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
1572 * trying for a better one later. Do not set the preferred node here. 1605 * trying for a better one later. Do not set the preferred node here.
1573 */ 1606 */
1574 if (p->numa_group) { 1607 if (p->numa_group) {
1608 struct numa_group *ng = p->numa_group;
1609
1575 if (env.best_cpu == -1) 1610 if (env.best_cpu == -1)
1576 nid = env.src_nid; 1611 nid = env.src_nid;
1577 else 1612 else
1578 nid = env.dst_nid; 1613 nid = env.dst_nid;
1579 1614
1580 if (node_isset(nid, p->numa_group->active_nodes)) 1615 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
1581 sched_setnuma(p, env.dst_nid); 1616 sched_setnuma(p, env.dst_nid);
1582 } 1617 }
1583 1618
@@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
1627} 1662}
1628 1663
1629/* 1664/*
1630 * Find the nodes on which the workload is actively running. We do this by 1665 * Find out how many nodes the workload is actively running on. Do this by
1631 * tracking the nodes from which NUMA hinting faults are triggered. This can 1666 * tracking the nodes from which NUMA hinting faults are triggered. This can
1632 * be different from the set of nodes where the workload's memory is currently 1667 * be different from the set of nodes where the workload's memory is currently
1633 * located. 1668 * located.
1634 *
1635 * The bitmask is used to make smarter decisions on when to do NUMA page
1636 * migrations, To prevent flip-flopping, and excessive page migrations, nodes
1637 * are added when they cause over 6/16 of the maximum number of faults, but
1638 * only removed when they drop below 3/16.
1639 */ 1669 */
1640static void update_numa_active_node_mask(struct numa_group *numa_group) 1670static void numa_group_count_active_nodes(struct numa_group *numa_group)
1641{ 1671{
1642 unsigned long faults, max_faults = 0; 1672 unsigned long faults, max_faults = 0;
1643 int nid; 1673 int nid, active_nodes = 0;
1644 1674
1645 for_each_online_node(nid) { 1675 for_each_online_node(nid) {
1646 faults = group_faults_cpu(numa_group, nid); 1676 faults = group_faults_cpu(numa_group, nid);
@@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
1650 1680
1651 for_each_online_node(nid) { 1681 for_each_online_node(nid) {
1652 faults = group_faults_cpu(numa_group, nid); 1682 faults = group_faults_cpu(numa_group, nid);
1653 if (!node_isset(nid, numa_group->active_nodes)) { 1683 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1654 if (faults > max_faults * 6 / 16) 1684 active_nodes++;
1655 node_set(nid, numa_group->active_nodes);
1656 } else if (faults < max_faults * 3 / 16)
1657 node_clear(nid, numa_group->active_nodes);
1658 } 1685 }
1686
1687 numa_group->max_faults_cpu = max_faults;
1688 numa_group->active_nodes = active_nodes;
1659} 1689}
1660 1690
1661/* 1691/*
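
[Editorial note] numa_group_count_active_nodes() above replaces the old active_nodes nodemask with a simple count: a node counts as active when its CPU fault count exceeds 1/3 of the busiest node's (ACTIVE_NODE_FRACTION == 3). A user-space model of that count; the faults array and function name are invented for the example:

#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

/* A node is "active" when it sees more than 1/3 of the busiest node's faults. */
static int count_active_nodes(const unsigned long *faults, int nr_nodes)
{
	unsigned long max_faults = 0;
	int nid, active = 0;

	for (nid = 0; nid < nr_nodes; nid++)
		if (faults[nid] > max_faults)
			max_faults = faults[nid];

	for (nid = 0; nid < nr_nodes; nid++)
		if (faults[nid] * ACTIVE_NODE_FRACTION > max_faults)
			active++;

	return active;
}

int main(void)
{
	unsigned long faults[] = { 90, 40, 10, 0 };

	/* Threshold is >30 faults here, so two nodes count as active. */
	printf("%d\n", count_active_nodes(faults, 4));
	return 0;
}

With the sample counts {90, 40, 10, 0} the threshold is 30 faults, so two nodes are active — the same kind of value the group uses to decide whether pseudo-interleaving is in effect.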
@@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
1946 update_task_scan_period(p, fault_types[0], fault_types[1]); 1976 update_task_scan_period(p, fault_types[0], fault_types[1]);
1947 1977
1948 if (p->numa_group) { 1978 if (p->numa_group) {
1949 update_numa_active_node_mask(p->numa_group); 1979 numa_group_count_active_nodes(p->numa_group);
1950 spin_unlock_irq(group_lock); 1980 spin_unlock_irq(group_lock);
1951 max_nid = preferred_group_nid(p, max_group_nid); 1981 max_nid = preferred_group_nid(p, max_group_nid);
1952 } 1982 }
@@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1990 return; 2020 return;
1991 2021
1992 atomic_set(&grp->refcount, 1); 2022 atomic_set(&grp->refcount, 1);
2023 grp->active_nodes = 1;
2024 grp->max_faults_cpu = 0;
1993 spin_lock_init(&grp->lock); 2025 spin_lock_init(&grp->lock);
1994 grp->gid = p->pid; 2026 grp->gid = p->pid;
1995 /* Second half of the array tracks nids where faults happen */ 2027 /* Second half of the array tracks nids where faults happen */
1996 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * 2028 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1997 nr_node_ids; 2029 nr_node_ids;
1998 2030
1999 node_set(task_node(current), grp->active_nodes);
2000
2001 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 2031 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2002 grp->faults[i] = p->numa_faults[i]; 2032 grp->faults[i] = p->numa_faults[i];
2003 2033
@@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2111 bool migrated = flags & TNF_MIGRATED; 2141 bool migrated = flags & TNF_MIGRATED;
2112 int cpu_node = task_node(current); 2142 int cpu_node = task_node(current);
2113 int local = !!(flags & TNF_FAULT_LOCAL); 2143 int local = !!(flags & TNF_FAULT_LOCAL);
2144 struct numa_group *ng;
2114 int priv; 2145 int priv;
2115 2146
2116 if (!static_branch_likely(&sched_numa_balancing)) 2147 if (!static_branch_likely(&sched_numa_balancing))
@@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2151 * actively using should be counted as local. This allows the 2182 * actively using should be counted as local. This allows the
2152 * scan rate to slow down when a workload has settled down. 2183 * scan rate to slow down when a workload has settled down.
2153 */ 2184 */
2154 if (!priv && !local && p->numa_group && 2185 ng = p->numa_group;
2155 node_isset(cpu_node, p->numa_group->active_nodes) && 2186 if (!priv && !local && ng && ng->active_nodes > 1 &&
2156 node_isset(mem_node, p->numa_group->active_nodes)) 2187 numa_is_active_node(cpu_node, ng) &&
2188 numa_is_active_node(mem_node, ng))
2157 local = 1; 2189 local = 1;
2158 2190
2159 task_numa_placement(p); 2191 task_numa_placement(p);
@@ -3102,6 +3134,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3102 3134
3103static void check_enqueue_throttle(struct cfs_rq *cfs_rq); 3135static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3104 3136
3137static inline void check_schedstat_required(void)
3138{
3139#ifdef CONFIG_SCHEDSTATS
3140 if (schedstat_enabled())
3141 return;
3142
3143 /* Force schedstat enabled if a dependent tracepoint is active */
3144 if (trace_sched_stat_wait_enabled() ||
3145 trace_sched_stat_sleep_enabled() ||
3146 trace_sched_stat_iowait_enabled() ||
3147 trace_sched_stat_blocked_enabled() ||
3148 trace_sched_stat_runtime_enabled()) {
3149 pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3150 "stat_blocked and stat_runtime require the "
3151 "kernel parameter schedstats=enabled or "
3152 "kernel.sched_schedstats=1\n");
3153 }
3154#endif
3155}
3156
3105static void 3157static void
3106enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 3158enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3107{ 3159{
@@ -3122,11 +3174,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3122 3174
3123 if (flags & ENQUEUE_WAKEUP) { 3175 if (flags & ENQUEUE_WAKEUP) {
3124 place_entity(cfs_rq, se, 0); 3176 place_entity(cfs_rq, se, 0);
3125 enqueue_sleeper(cfs_rq, se); 3177 if (schedstat_enabled())
3178 enqueue_sleeper(cfs_rq, se);
3126 } 3179 }
3127 3180
3128 update_stats_enqueue(cfs_rq, se); 3181 check_schedstat_required();
3129 check_spread(cfs_rq, se); 3182 if (schedstat_enabled()) {
3183 update_stats_enqueue(cfs_rq, se);
3184 check_spread(cfs_rq, se);
3185 }
3130 if (se != cfs_rq->curr) 3186 if (se != cfs_rq->curr)
3131 __enqueue_entity(cfs_rq, se); 3187 __enqueue_entity(cfs_rq, se);
3132 se->on_rq = 1; 3188 se->on_rq = 1;
@@ -3193,19 +3249,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3193 update_curr(cfs_rq); 3249 update_curr(cfs_rq);
3194 dequeue_entity_load_avg(cfs_rq, se); 3250 dequeue_entity_load_avg(cfs_rq, se);
3195 3251
3196 update_stats_dequeue(cfs_rq, se); 3252 if (schedstat_enabled())
3197 if (flags & DEQUEUE_SLEEP) { 3253 update_stats_dequeue(cfs_rq, se, flags);
3198#ifdef CONFIG_SCHEDSTATS
3199 if (entity_is_task(se)) {
3200 struct task_struct *tsk = task_of(se);
3201
3202 if (tsk->state & TASK_INTERRUPTIBLE)
3203 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
3204 if (tsk->state & TASK_UNINTERRUPTIBLE)
3205 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
3206 }
3207#endif
3208 }
3209 3254
3210 clear_buddies(cfs_rq, se); 3255 clear_buddies(cfs_rq, se);
3211 3256
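The sleep/block timestamps removed above do not go away: update_stats_dequeue() now takes the dequeue flags, so a plausible reading (a sketch of the relocated logic, not code quoted from this series) is that it performs the same bookkeeping itself, behind schedstat_enabled():

    static inline void
    update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
    {
            /* existing wait-time accounting elided */

            if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
                    struct task_struct *tsk = task_of(se);

                    if (tsk->state & TASK_INTERRUPTIBLE)
                            se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
                    if (tsk->state & TASK_UNINTERRUPTIBLE)
                            se->statistics.block_start = rq_clock(rq_of(cfs_rq));
            }
    }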
@@ -3279,7 +3324,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3279 * a CPU. So account for the time it spent waiting on the 3324 * a CPU. So account for the time it spent waiting on the
3280 * runqueue. 3325 * runqueue.
3281 */ 3326 */
3282 update_stats_wait_end(cfs_rq, se); 3327 if (schedstat_enabled())
3328 update_stats_wait_end(cfs_rq, se);
3283 __dequeue_entity(cfs_rq, se); 3329 __dequeue_entity(cfs_rq, se);
3284 update_load_avg(se, 1); 3330 update_load_avg(se, 1);
3285 } 3331 }
@@ -3292,7 +3338,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3292 * least twice that of our own weight (i.e. dont track it 3338 * least twice that of our own weight (i.e. dont track it
3293 * when there are only lesser-weight tasks around): 3339 * when there are only lesser-weight tasks around):
3294 */ 3340 */
3295 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 3341 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3296 se->statistics.slice_max = max(se->statistics.slice_max, 3342 se->statistics.slice_max = max(se->statistics.slice_max,
3297 se->sum_exec_runtime - se->prev_sum_exec_runtime); 3343 se->sum_exec_runtime - se->prev_sum_exec_runtime);
3298 } 3344 }
@@ -3375,9 +3421,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3375 /* throttle cfs_rqs exceeding runtime */ 3421 /* throttle cfs_rqs exceeding runtime */
3376 check_cfs_rq_runtime(cfs_rq); 3422 check_cfs_rq_runtime(cfs_rq);
3377 3423
3378 check_spread(cfs_rq, prev); 3424 if (schedstat_enabled()) {
3425 check_spread(cfs_rq, prev);
3426 if (prev->on_rq)
3427 update_stats_wait_start(cfs_rq, prev);
3428 }
3429
3379 if (prev->on_rq) { 3430 if (prev->on_rq) {
3380 update_stats_wait_start(cfs_rq, prev);
3381 /* Put 'current' back into the tree. */ 3431 /* Put 'current' back into the tree. */
3382 __enqueue_entity(cfs_rq, prev); 3432 __enqueue_entity(cfs_rq, prev);
3383 /* in !on_rq case, update occurred at dequeue */ 3433 /* in !on_rq case, update occurred at dequeue */
@@ -4459,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4459 4509
4460 /* scale is effectively 1 << i now, and >> i divides by scale */ 4510 /* scale is effectively 1 << i now, and >> i divides by scale */
4461 4511
4462 old_load = this_rq->cpu_load[i] - tickless_load; 4512 old_load = this_rq->cpu_load[i];
4463 old_load = decay_load_missed(old_load, pending_updates - 1, i); 4513 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4464 old_load += tickless_load; 4514 if (tickless_load) {
4515 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
4516 /*
4517 * old_load can never be a negative value because a
4518 * decayed tickless_load cannot be greater than the
4519 * original tickless_load.
4520 */
4521 old_load += tickless_load;
4522 }
4465 new_load = this_load; 4523 new_load = this_load;
4466 /* 4524 /*
4467 * Round up the averaging division if load is increasing. This 4525 * Round up the averaging division if load is increasing. This
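A toy example of why the two terms are now decayed separately, using load >> missed as a stand-in for decay_load_missed() (the real function uses a precomputed degrade table):

    cpu_load[i] = 512, tickless_load = 1024, pending_updates - 1 = 1

    old scheme:  decay(512 - 1024)                 -> the unsigned subtraction
                                                      underflows before decaying
    new scheme:  decay(512) - decay(1024) + 1024
              =  256 - 512 + 1024 = 768            -> never negative, because
                                                      decay(tickless_load) can
                                                      never exceed tickless_load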
@@ -4484,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu)
4484} 4542}
4485 4543
4486#ifdef CONFIG_NO_HZ_COMMON 4544#ifdef CONFIG_NO_HZ_COMMON
4545static void __update_cpu_load_nohz(struct rq *this_rq,
4546 unsigned long curr_jiffies,
4547 unsigned long load,
4548 int active)
4549{
4550 unsigned long pending_updates;
4551
4552 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4553 if (pending_updates) {
4554 this_rq->last_load_update_tick = curr_jiffies;
4555 /*
4556 * In the regular NOHZ case, we were idle, this means load 0.
4557 * In the NOHZ_FULL case, we were non-idle, we should consider
4558 * its weighted load.
4559 */
4560 __update_cpu_load(this_rq, load, pending_updates, active);
4561 }
4562}
4563
4487/* 4564/*
4488 * There is no sane way to deal with nohz on smp when using jiffies because the 4565 * There is no sane way to deal with nohz on smp when using jiffies because the
4489 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 4566 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -4501,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu)
4501 * Called from nohz_idle_balance() to update the load ratings before doing the 4578 * Called from nohz_idle_balance() to update the load ratings before doing the
4502 * idle balance. 4579 * idle balance.
4503 */ 4580 */
4504static void update_idle_cpu_load(struct rq *this_rq) 4581static void update_cpu_load_idle(struct rq *this_rq)
4505{ 4582{
4506 unsigned long curr_jiffies = READ_ONCE(jiffies);
4507 unsigned long load = weighted_cpuload(cpu_of(this_rq));
4508 unsigned long pending_updates;
4509
4510 /* 4583 /*
4511 * bail if there's load or we're actually up-to-date. 4584 * bail if there's load or we're actually up-to-date.
4512 */ 4585 */
4513 if (load || curr_jiffies == this_rq->last_load_update_tick) 4586 if (weighted_cpuload(cpu_of(this_rq)))
4514 return; 4587 return;
4515 4588
4516 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 4589 __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
4517 this_rq->last_load_update_tick = curr_jiffies;
4518
4519 __update_cpu_load(this_rq, load, pending_updates, 0);
4520} 4590}
4521 4591
4522/* 4592/*
@@ -4527,22 +4597,12 @@ void update_cpu_load_nohz(int active)
4527 struct rq *this_rq = this_rq(); 4597 struct rq *this_rq = this_rq();
4528 unsigned long curr_jiffies = READ_ONCE(jiffies); 4598 unsigned long curr_jiffies = READ_ONCE(jiffies);
4529 unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; 4599 unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
4530 unsigned long pending_updates;
4531 4600
4532 if (curr_jiffies == this_rq->last_load_update_tick) 4601 if (curr_jiffies == this_rq->last_load_update_tick)
4533 return; 4602 return;
4534 4603
4535 raw_spin_lock(&this_rq->lock); 4604 raw_spin_lock(&this_rq->lock);
4536 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 4605 __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
4537 if (pending_updates) {
4538 this_rq->last_load_update_tick = curr_jiffies;
4539 /*
4540 * In the regular NOHZ case, we were idle, this means load 0.
4541 * In the NOHZ_FULL case, we were non-idle, we should consider
4542 * its weighted load.
4543 */
4544 __update_cpu_load(this_rq, load, pending_updates, active);
4545 }
4546 raw_spin_unlock(&this_rq->lock); 4606 raw_spin_unlock(&this_rq->lock);
4547} 4607}
4548#endif /* CONFIG_NO_HZ */ 4608#endif /* CONFIG_NO_HZ */
@@ -4554,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq)
4554{ 4614{
4555 unsigned long load = weighted_cpuload(cpu_of(this_rq)); 4615 unsigned long load = weighted_cpuload(cpu_of(this_rq));
4556 /* 4616 /*
4557 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 4617 * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
4558 */ 4618 */
4559 this_rq->last_load_update_tick = jiffies; 4619 this_rq->last_load_update_tick = jiffies;
4560 __update_cpu_load(this_rq, load, 1, 1); 4620 __update_cpu_load(this_rq, load, 1, 1);
@@ -7848,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7848 if (time_after_eq(jiffies, rq->next_balance)) { 7908 if (time_after_eq(jiffies, rq->next_balance)) {
7849 raw_spin_lock_irq(&rq->lock); 7909 raw_spin_lock_irq(&rq->lock);
7850 update_rq_clock(rq); 7910 update_rq_clock(rq);
7851 update_idle_cpu_load(rq); 7911 update_cpu_load_idle(rq);
7852 raw_spin_unlock_irq(&rq->lock); 7912 raw_spin_unlock_irq(&rq->lock);
7853 rebalance_domains(rq, CPU_IDLE); 7913 rebalance_domains(rq, CPU_IDLE);
7854 } 7914 }
@@ -8234,11 +8294,8 @@ void free_fair_sched_group(struct task_group *tg)
8234 for_each_possible_cpu(i) { 8294 for_each_possible_cpu(i) {
8235 if (tg->cfs_rq) 8295 if (tg->cfs_rq)
8236 kfree(tg->cfs_rq[i]); 8296 kfree(tg->cfs_rq[i]);
8237 if (tg->se) { 8297 if (tg->se)
8238 if (tg->se[i])
8239 remove_entity_load_avg(tg->se[i]);
8240 kfree(tg->se[i]); 8298 kfree(tg->se[i]);
8241 }
8242 } 8299 }
8243 8300
8244 kfree(tg->cfs_rq); 8301 kfree(tg->cfs_rq);
@@ -8286,21 +8343,29 @@ err:
8286 return 0; 8343 return 0;
8287} 8344}
8288 8345
8289void unregister_fair_sched_group(struct task_group *tg, int cpu) 8346void unregister_fair_sched_group(struct task_group *tg)
8290{ 8347{
8291 struct rq *rq = cpu_rq(cpu);
8292 unsigned long flags; 8348 unsigned long flags;
8349 struct rq *rq;
8350 int cpu;
8293 8351
8294 /* 8352 for_each_possible_cpu(cpu) {
8295 * Only empty task groups can be destroyed; so we can speculatively 8353 if (tg->se[cpu])
8296 * check on_list without danger of it being re-added. 8354 remove_entity_load_avg(tg->se[cpu]);
8297 */
8298 if (!tg->cfs_rq[cpu]->on_list)
8299 return;
8300 8355
8301 raw_spin_lock_irqsave(&rq->lock, flags); 8356 /*
8302 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8357 * Only empty task groups can be destroyed; so we can speculatively
8303 raw_spin_unlock_irqrestore(&rq->lock, flags); 8358 * check on_list without danger of it being re-added.
8359 */
8360 if (!tg->cfs_rq[cpu]->on_list)
8361 continue;
8362
8363 rq = cpu_rq(cpu);
8364
8365 raw_spin_lock_irqsave(&rq->lock, flags);
8366 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8367 raw_spin_unlock_irqrestore(&rq->lock, flags);
8368 }
8304} 8369}
8305 8370
8306void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 8371void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8382,7 +8447,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8382 return 1; 8447 return 1;
8383} 8448}
8384 8449
8385void unregister_fair_sched_group(struct task_group *tg, int cpu) { } 8450void unregister_fair_sched_group(struct task_group *tg) { }
8386 8451
8387#endif /* CONFIG_FAIR_GROUP_SCHED */ 8452#endif /* CONFIG_FAIR_GROUP_SCHED */
8388 8453
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8ec86abe0ea1..a774b4dbf291 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
58 raw_spin_lock(&rt_b->rt_runtime_lock); 58 raw_spin_lock(&rt_b->rt_runtime_lock);
59 if (!rt_b->rt_period_active) { 59 if (!rt_b->rt_period_active) {
60 rt_b->rt_period_active = 1; 60 rt_b->rt_period_active = 1;
61 hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); 61 /*
62 * SCHED_DEADLINE updates the bandwidth, as a run away
63 * RT task with a DL task could hog a CPU. But DL does
64 * not reset the period. If a deadline task was running
65 * without an RT task running, it can cause RT tasks to
66 * throttle when they start up. Kick the timer right away
67 * to update the period.
68 */
69 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
62 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); 70 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
63 } 71 }
64 raw_spin_unlock(&rt_b->rt_runtime_lock); 72 raw_spin_unlock(&rt_b->rt_runtime_lock);
@@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
436 444
437static inline int on_rt_rq(struct sched_rt_entity *rt_se) 445static inline int on_rt_rq(struct sched_rt_entity *rt_se)
438{ 446{
439 return !list_empty(&rt_se->run_list); 447 return rt_se->on_rq;
440} 448}
441 449
442#ifdef CONFIG_RT_GROUP_SCHED 450#ifdef CONFIG_RT_GROUP_SCHED
@@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
482 return rt_se->my_q; 490 return rt_se->my_q;
483} 491}
484 492
485static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); 493static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
486static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 494static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
487 495
488static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 496static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
489{ 497{
@@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
499 if (!rt_se) 507 if (!rt_se)
500 enqueue_top_rt_rq(rt_rq); 508 enqueue_top_rt_rq(rt_rq);
501 else if (!on_rt_rq(rt_se)) 509 else if (!on_rt_rq(rt_se))
502 enqueue_rt_entity(rt_se, false); 510 enqueue_rt_entity(rt_se, 0);
503 511
504 if (rt_rq->highest_prio.curr < curr->prio) 512 if (rt_rq->highest_prio.curr < curr->prio)
505 resched_curr(rq); 513 resched_curr(rq);
@@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
516 if (!rt_se) 524 if (!rt_se)
517 dequeue_top_rt_rq(rt_rq); 525 dequeue_top_rt_rq(rt_rq);
518 else if (on_rt_rq(rt_se)) 526 else if (on_rt_rq(rt_se))
519 dequeue_rt_entity(rt_se); 527 dequeue_rt_entity(rt_se, 0);
520} 528}
521 529
522static inline int rt_rq_throttled(struct rt_rq *rt_rq) 530static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1166,7 +1174,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1166 dec_rt_group(rt_se, rt_rq); 1174 dec_rt_group(rt_se, rt_rq);
1167} 1175}
1168 1176
1169static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1177/*
1178 * Change rt_se->run_list location unless SAVE && !MOVE
1179 *
1180 * assumes ENQUEUE/DEQUEUE flags match
1181 */
1182static inline bool move_entity(unsigned int flags)
1183{
1184 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1185 return false;
1186
1187 return true;
1188}
1189
1190static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1191{
1192 list_del_init(&rt_se->run_list);
1193
1194 if (list_empty(array->queue + rt_se_prio(rt_se)))
1195 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1196
1197 rt_se->on_list = 0;
1198}
1199
1200static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1170{ 1201{
1171 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1202 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1172 struct rt_prio_array *array = &rt_rq->active; 1203 struct rt_prio_array *array = &rt_rq->active;
@@ -1179,26 +1210,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1179 * get throttled and the current group doesn't have any other 1210 * get throttled and the current group doesn't have any other
1180 * active members. 1211 * active members.
1181 */ 1212 */
1182 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 1213 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1214 if (rt_se->on_list)
1215 __delist_rt_entity(rt_se, array);
1183 return; 1216 return;
1217 }
1184 1218
1185 if (head) 1219 if (move_entity(flags)) {
1186 list_add(&rt_se->run_list, queue); 1220 WARN_ON_ONCE(rt_se->on_list);
1187 else 1221 if (flags & ENQUEUE_HEAD)
1188 list_add_tail(&rt_se->run_list, queue); 1222 list_add(&rt_se->run_list, queue);
1189 __set_bit(rt_se_prio(rt_se), array->bitmap); 1223 else
1224 list_add_tail(&rt_se->run_list, queue);
1225
1226 __set_bit(rt_se_prio(rt_se), array->bitmap);
1227 rt_se->on_list = 1;
1228 }
1229 rt_se->on_rq = 1;
1190 1230
1191 inc_rt_tasks(rt_se, rt_rq); 1231 inc_rt_tasks(rt_se, rt_rq);
1192} 1232}
1193 1233
1194static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) 1234static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1195{ 1235{
1196 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 1236 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1197 struct rt_prio_array *array = &rt_rq->active; 1237 struct rt_prio_array *array = &rt_rq->active;
1198 1238
1199 list_del_init(&rt_se->run_list); 1239 if (move_entity(flags)) {
1200 if (list_empty(array->queue + rt_se_prio(rt_se))) 1240 WARN_ON_ONCE(!rt_se->on_list);
1201 __clear_bit(rt_se_prio(rt_se), array->bitmap); 1241 __delist_rt_entity(rt_se, array);
1242 }
1243 rt_se->on_rq = 0;
1202 1244
1203 dec_rt_tasks(rt_se, rt_rq); 1245 dec_rt_tasks(rt_se, rt_rq);
1204} 1246}
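Spelled out, move_entity() gives the following behaviour per flag combination (DEQUEUE_SAVE aliases ENQUEUE_RESTORE and DEQUEUE_MOVE aliases ENQUEUE_MOVE, per the sched.h hunk further down):

    flags                           run_list / prio bitmap    rt_se->on_rq
    0 (plain enqueue/dequeue)       moved                     updated
    DEQUEUE_SLEEP                   moved                     updated
    DEQUEUE_SAVE only               left in place             updated
    DEQUEUE_SAVE | DEQUEUE_MOVE     moved                     updated

Only the SAVE-without-MOVE case keeps the entity's position, which is exactly the spurious dequeue/enqueue pair described in the flag documentation added to sched.h below.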
@@ -1207,7 +1249,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1207 * Because the prio of an upper entry depends on the lower 1249 * Because the prio of an upper entry depends on the lower
1208 * entries, we must remove entries top - down. 1250 * entries, we must remove entries top - down.
1209 */ 1251 */
1210static void dequeue_rt_stack(struct sched_rt_entity *rt_se) 1252static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1211{ 1253{
1212 struct sched_rt_entity *back = NULL; 1254 struct sched_rt_entity *back = NULL;
1213 1255
@@ -1220,31 +1262,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1220 1262
1221 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1263 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1222 if (on_rt_rq(rt_se)) 1264 if (on_rt_rq(rt_se))
1223 __dequeue_rt_entity(rt_se); 1265 __dequeue_rt_entity(rt_se, flags);
1224 } 1266 }
1225} 1267}
1226 1268
1227static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1269static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1228{ 1270{
1229 struct rq *rq = rq_of_rt_se(rt_se); 1271 struct rq *rq = rq_of_rt_se(rt_se);
1230 1272
1231 dequeue_rt_stack(rt_se); 1273 dequeue_rt_stack(rt_se, flags);
1232 for_each_sched_rt_entity(rt_se) 1274 for_each_sched_rt_entity(rt_se)
1233 __enqueue_rt_entity(rt_se, head); 1275 __enqueue_rt_entity(rt_se, flags);
1234 enqueue_top_rt_rq(&rq->rt); 1276 enqueue_top_rt_rq(&rq->rt);
1235} 1277}
1236 1278
1237static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 1279static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1238{ 1280{
1239 struct rq *rq = rq_of_rt_se(rt_se); 1281 struct rq *rq = rq_of_rt_se(rt_se);
1240 1282
1241 dequeue_rt_stack(rt_se); 1283 dequeue_rt_stack(rt_se, flags);
1242 1284
1243 for_each_sched_rt_entity(rt_se) { 1285 for_each_sched_rt_entity(rt_se) {
1244 struct rt_rq *rt_rq = group_rt_rq(rt_se); 1286 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1245 1287
1246 if (rt_rq && rt_rq->rt_nr_running) 1288 if (rt_rq && rt_rq->rt_nr_running)
1247 __enqueue_rt_entity(rt_se, false); 1289 __enqueue_rt_entity(rt_se, flags);
1248 } 1290 }
1249 enqueue_top_rt_rq(&rq->rt); 1291 enqueue_top_rt_rq(&rq->rt);
1250} 1292}
@@ -1260,7 +1302,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1260 if (flags & ENQUEUE_WAKEUP) 1302 if (flags & ENQUEUE_WAKEUP)
1261 rt_se->timeout = 0; 1303 rt_se->timeout = 0;
1262 1304
1263 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); 1305 enqueue_rt_entity(rt_se, flags);
1264 1306
1265 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1307 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1266 enqueue_pushable_task(rq, p); 1308 enqueue_pushable_task(rq, p);
@@ -1271,7 +1313,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1271 struct sched_rt_entity *rt_se = &p->rt; 1313 struct sched_rt_entity *rt_se = &p->rt;
1272 1314
1273 update_curr_rt(rq); 1315 update_curr_rt(rq);
1274 dequeue_rt_entity(rt_se); 1316 dequeue_rt_entity(rt_se, flags);
1275 1317
1276 dequeue_pushable_task(rq, p); 1318 dequeue_pushable_task(rq, p);
1277} 1319}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10f16374df7f..ef5875fff5b7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3,6 +3,7 @@
3#include <linux/sched/sysctl.h> 3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h> 4#include <linux/sched/rt.h>
5#include <linux/sched/deadline.h> 5#include <linux/sched/deadline.h>
6#include <linux/binfmts.h>
6#include <linux/mutex.h> 7#include <linux/mutex.h>
7#include <linux/spinlock.h> 8#include <linux/spinlock.h>
8#include <linux/stop_machine.h> 9#include <linux/stop_machine.h>
@@ -313,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data);
313 314
314extern void free_fair_sched_group(struct task_group *tg); 315extern void free_fair_sched_group(struct task_group *tg);
315extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); 316extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
316extern void unregister_fair_sched_group(struct task_group *tg, int cpu); 317extern void unregister_fair_sched_group(struct task_group *tg);
317extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 318extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
318 struct sched_entity *se, int cpu, 319 struct sched_entity *se, int cpu,
319 struct sched_entity *parent); 320 struct sched_entity *parent);
320extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); 321extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
321extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
322 322
323extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); 323extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
324extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); 324extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
@@ -909,6 +909,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
909 909
910extern int group_balance_cpu(struct sched_group *sg); 910extern int group_balance_cpu(struct sched_group *sg);
911 911
912#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
913void register_sched_domain_sysctl(void);
914void unregister_sched_domain_sysctl(void);
915#else
916static inline void register_sched_domain_sysctl(void)
917{
918}
919static inline void unregister_sched_domain_sysctl(void)
920{
921}
922#endif
923
912#else 924#else
913 925
914static inline void sched_ttwu_pending(void) { } 926static inline void sched_ttwu_pending(void) { }
@@ -1022,6 +1034,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
1022#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 1034#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
1023 1035
1024extern struct static_key_false sched_numa_balancing; 1036extern struct static_key_false sched_numa_balancing;
1037extern struct static_key_false sched_schedstats;
1025 1038
1026static inline u64 global_rt_period(void) 1039static inline u64 global_rt_period(void)
1027{ 1040{
@@ -1130,18 +1143,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1130extern const int sched_prio_to_weight[40]; 1143extern const int sched_prio_to_weight[40];
1131extern const u32 sched_prio_to_wmult[40]; 1144extern const u32 sched_prio_to_wmult[40];
1132 1145
1146/*
1147 * {de,en}queue flags:
1148 *
1149 * DEQUEUE_SLEEP - task is no longer runnable
1150 * ENQUEUE_WAKEUP - task just became runnable
1151 *
1152 * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
1153 * are in a known state which allows modification. Such pairs
1154 * should preserve as much state as possible.
1155 *
1156 * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
1157 * in the runqueue.
1158 *
1159 * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
1160 * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
1161 * ENQUEUE_WAKING - sched_class::task_waking was called
1162 *
1163 */
1164
1165#define DEQUEUE_SLEEP 0x01
1166#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
1167#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
1168
1133#define ENQUEUE_WAKEUP 0x01 1169#define ENQUEUE_WAKEUP 0x01
1134#define ENQUEUE_HEAD 0x02 1170#define ENQUEUE_RESTORE 0x02
1171#define ENQUEUE_MOVE 0x04
1172
1173#define ENQUEUE_HEAD 0x08
1174#define ENQUEUE_REPLENISH 0x10
1135#ifdef CONFIG_SMP 1175#ifdef CONFIG_SMP
1136#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ 1176#define ENQUEUE_WAKING 0x20
1137#else 1177#else
1138#define ENQUEUE_WAKING 0x00 1178#define ENQUEUE_WAKING 0x00
1139#endif 1179#endif
1140#define ENQUEUE_REPLENISH 0x08
1141#define ENQUEUE_RESTORE 0x10
1142
1143#define DEQUEUE_SLEEP 0x01
1144#define DEQUEUE_SAVE 0x02
1145 1180
1146#define RETRY_TASK ((void *)-1UL) 1181#define RETRY_TASK ((void *)-1UL)
1147 1182
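A sketch of the caller-side pattern these flags are meant for, roughly what __sched_setscheduler()-style code in core.c does (that file is not part of this excerpt, so treat the exact helpers and flag choices below as assumptions):

    queued  = task_on_rq_queued(p);
    running = task_current(rq, p);

    if (queued)
            /* spurious dequeue: preserve as much state as possible */
            dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_MOVE);
    if (running)
            put_prev_task(rq, p);

    /* ... change p's policy / priority here ... */

    if (running)
            p->sched_class->set_curr_task(rq);
    if (queued)
            /* put it back, possibly at a new spot in the queue */
            enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_MOVE);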
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b0fbc7632de5..70b3b6a20fb0 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
29 if (rq) 29 if (rq)
30 rq->rq_sched_info.run_delay += delta; 30 rq->rq_sched_info.run_delay += delta;
31} 31}
32# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 32# define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
33# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 33# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
34# define schedstat_set(var, val) do { var = (val); } while (0) 34# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
35# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
35#else /* !CONFIG_SCHEDSTATS */ 36#else /* !CONFIG_SCHEDSTATS */
36static inline void 37static inline void
37rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 38rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
42static inline void 43static inline void
43rq_sched_info_depart(struct rq *rq, unsigned long long delta) 44rq_sched_info_depart(struct rq *rq, unsigned long long delta)
44{} 45{}
46# define schedstat_enabled() 0
45# define schedstat_inc(rq, field) do { } while (0) 47# define schedstat_inc(rq, field) do { } while (0)
46# define schedstat_add(rq, field, amt) do { } while (0) 48# define schedstat_add(rq, field, amt) do { } while (0)
47# define schedstat_set(var, val) do { } while (0) 49# define schedstat_set(var, val) do { } while (0)
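schedstat_enabled() keys off the sched_schedstats static key declared in the sched.h hunk above; the key itself is defined and toggled in core.c, which is not shown here. A minimal sketch of that side, assuming the standard static-key API (DEFINE_STATIC_KEY_FALSE, static_branch_enable/disable):

    DEFINE_STATIC_KEY_FALSE(sched_schedstats);      /* matches the extern in sched.h */

    static void set_schedstats(bool enabled)        /* illustrative name */
    {
            if (enabled)
                    static_branch_enable(&sched_schedstats);
            else
                    static_branch_disable(&sched_schedstats);
    }

With the key off (the default), schedstat_inc()/schedstat_add()/schedstat_set() compile down to a patched-out branch, which is what makes CONFIG_SCHEDSTATS=y cheap until someone flips it via the schedstats= boot parameter or the kernel.sched_schedstats sysctl wired up in kernel/sysctl.c below.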
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
new file mode 100644
index 000000000000..82f0dff90030
--- /dev/null
+++ b/kernel/sched/swait.c
@@ -0,0 +1,123 @@
1#include <linux/sched.h>
2#include <linux/swait.h>
3
4void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
5 struct lock_class_key *key)
6{
7 raw_spin_lock_init(&q->lock);
8 lockdep_set_class_and_name(&q->lock, key, name);
9 INIT_LIST_HEAD(&q->task_list);
10}
11EXPORT_SYMBOL(__init_swait_queue_head);
12
13/*
14 * The thing about the wake_up_state() return value; I think we can ignore it.
15 *
16 * If for some reason it would return 0, that means the previously waiting
17 * task is already running, so it will observe condition true (or has already).
18 */
19void swake_up_locked(struct swait_queue_head *q)
20{
21 struct swait_queue *curr;
22
23 if (list_empty(&q->task_list))
24 return;
25
26 curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
27 wake_up_process(curr->task);
28 list_del_init(&curr->task_list);
29}
30EXPORT_SYMBOL(swake_up_locked);
31
32void swake_up(struct swait_queue_head *q)
33{
34 unsigned long flags;
35
36 if (!swait_active(q))
37 return;
38
39 raw_spin_lock_irqsave(&q->lock, flags);
40 swake_up_locked(q);
41 raw_spin_unlock_irqrestore(&q->lock, flags);
42}
43EXPORT_SYMBOL(swake_up);
44
45/*
46 * Does not allow usage from IRQ disabled, since we must be able to
47 * release IRQs to guarantee bounded hold time.
48 */
49void swake_up_all(struct swait_queue_head *q)
50{
51 struct swait_queue *curr;
52 LIST_HEAD(tmp);
53
54 if (!swait_active(q))
55 return;
56
57 raw_spin_lock_irq(&q->lock);
58 list_splice_init(&q->task_list, &tmp);
59 while (!list_empty(&tmp)) {
60 curr = list_first_entry(&tmp, typeof(*curr), task_list);
61
62 wake_up_state(curr->task, TASK_NORMAL);
63 list_del_init(&curr->task_list);
64
65 if (list_empty(&tmp))
66 break;
67
68 raw_spin_unlock_irq(&q->lock);
69 raw_spin_lock_irq(&q->lock);
70 }
71 raw_spin_unlock_irq(&q->lock);
72}
73EXPORT_SYMBOL(swake_up_all);
74
75void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
76{
77 wait->task = current;
78 if (list_empty(&wait->task_list))
79 list_add(&wait->task_list, &q->task_list);
80}
81
82void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
83{
84 unsigned long flags;
85
86 raw_spin_lock_irqsave(&q->lock, flags);
87 __prepare_to_swait(q, wait);
88 set_current_state(state);
89 raw_spin_unlock_irqrestore(&q->lock, flags);
90}
91EXPORT_SYMBOL(prepare_to_swait);
92
93long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
94{
95 if (signal_pending_state(state, current))
96 return -ERESTARTSYS;
97
98 prepare_to_swait(q, wait, state);
99
100 return 0;
101}
102EXPORT_SYMBOL(prepare_to_swait_event);
103
104void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
105{
106 __set_current_state(TASK_RUNNING);
107 if (!list_empty(&wait->task_list))
108 list_del_init(&wait->task_list);
109}
110
111void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
112{
113 unsigned long flags;
114
115 __set_current_state(TASK_RUNNING);
116
117 if (!list_empty_careful(&wait->task_list)) {
118 raw_spin_lock_irqsave(&q->lock, flags);
119 list_del_init(&wait->task_list);
120 raw_spin_unlock_irqrestore(&q->lock, flags);
121 }
122}
123EXPORT_SYMBOL(finish_swait);
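A minimal usage sketch of the new API, mirroring the prepare/finish loop that the KVM conversion below adopts. DECLARE_SWAIT_QUEUE_HEAD() is assumed to come from include/linux/swait.h (added by this series but not shown in this excerpt):

    static DECLARE_SWAIT_QUEUE_HEAD(my_wq);         /* assumed swait.h helper */
    static bool my_cond;

    static void wait_for_cond(void)                 /* waiter */
    {
            DECLARE_SWAITQUEUE(wait);

            for (;;) {
                    prepare_to_swait(&my_wq, &wait, TASK_INTERRUPTIBLE);
                    if (READ_ONCE(my_cond))
                            break;
                    schedule();
            }
            finish_swait(&my_wq, &wait);
    }

    static void signal_cond(void)                   /* waker */
    {
            WRITE_ONCE(my_cond, true);
            smp_mb();  /* pairs with set_current_state() in prepare_to_swait() */
            if (swait_active(&my_wq))
                    swake_up(&my_wq);
    }

Note that swake_up() wakes a single waiter, and swake_up_all() drops the lock between wakeups and must not be called with IRQs disabled, so callers that used to rely on wake_up_all() from such contexts cannot be converted blindly; the comment above swake_up_all() spells out that restriction.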
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 479e4436f787..8aae49dd7da8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
116 116
117 if (preempt_count() == cnt) { 117 if (preempt_count() == cnt) {
118#ifdef CONFIG_DEBUG_PREEMPT 118#ifdef CONFIG_DEBUG_PREEMPT
119 current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); 119 current->preempt_disable_ip = get_lock_parent_ip();
120#endif 120#endif
121 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 121 trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
122 } 122 }
123} 123}
124EXPORT_SYMBOL(__local_bh_disable_ip); 124EXPORT_SYMBOL(__local_bh_disable_ip);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 97715fd9e790..f5102fabef7f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = {
350 .mode = 0644, 350 .mode = 0644,
351 .proc_handler = proc_dointvec, 351 .proc_handler = proc_dointvec,
352 }, 352 },
353#ifdef CONFIG_SCHEDSTATS
354 {
355 .procname = "sched_schedstats",
356 .data = NULL,
357 .maxlen = sizeof(unsigned int),
358 .mode = 0644,
359 .proc_handler = sysctl_schedstats,
360 .extra1 = &zero,
361 .extra2 = &one,
362 },
363#endif /* CONFIG_SCHEDSTATS */
353#endif /* CONFIG_SMP */ 364#endif /* CONFIG_SMP */
354#ifdef CONFIG_NUMA_BALANCING 365#ifdef CONFIG_NUMA_BALANCING
355 { 366 {
@@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = {
505 .data = &latencytop_enabled, 516 .data = &latencytop_enabled,
506 .maxlen = sizeof(int), 517 .maxlen = sizeof(int),
507 .mode = 0644, 518 .mode = 0644,
508 .proc_handler = proc_dointvec, 519 .proc_handler = sysctl_latencytop,
509 }, 520 },
510#endif 521#endif
511#ifdef CONFIG_BLK_DEV_INITRD 522#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 975cb49e32bf..f8e26ab963ed 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
93{ 93{
94 struct mm_struct *mm; 94 struct mm_struct *mm;
95 95
96 /* convert pages-usec to Mbyte-usec */ 96 /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
97 stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; 97 stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
98 stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; 98 do_div(stats->coremem, 1000 * KB);
99 stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
100 do_div(stats->virtmem, 1000 * KB);
99 mm = get_task_mm(p); 101 mm = get_task_mm(p);
100 if (mm) { 102 if (mm) {
101 /* adjust to KB unit */ 103 /* adjust to KB unit */
@@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
123static void __acct_update_integrals(struct task_struct *tsk, 125static void __acct_update_integrals(struct task_struct *tsk,
124 cputime_t utime, cputime_t stime) 126 cputime_t utime, cputime_t stime)
125{ 127{
126 if (likely(tsk->mm)) { 128 cputime_t time, dtime;
127 cputime_t time, dtime; 129 u64 delta;
128 struct timeval value; 130
129 unsigned long flags; 131 if (!likely(tsk->mm))
130 u64 delta; 132 return;
131 133
132 local_irq_save(flags); 134 time = stime + utime;
133 time = stime + utime; 135 dtime = time - tsk->acct_timexpd;
134 dtime = time - tsk->acct_timexpd; 136 /* Avoid division: cputime_t is often in nanoseconds already. */
135 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 137 delta = cputime_to_nsecs(dtime);
136 delta = value.tv_sec; 138
137 delta = delta * USEC_PER_SEC + value.tv_usec; 139 if (delta < TICK_NSEC)
138 140 return;
139 if (delta == 0) 141
140 goto out; 142 tsk->acct_timexpd = time;
141 tsk->acct_timexpd = time; 143 /*
142 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); 144 * Divide by 1024 to avoid overflow, and to avoid division.
143 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; 145 * The final unit reported to userspace is Mbyte-usecs,
144 out: 146 * the rest of the math is done in xacct_add_tsk.
145 local_irq_restore(flags); 147 */
146 } 148 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
149 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
147} 150}
148 151
149/** 152/**
@@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk,
153void acct_update_integrals(struct task_struct *tsk) 156void acct_update_integrals(struct task_struct *tsk)
154{ 157{
155 cputime_t utime, stime; 158 cputime_t utime, stime;
159 unsigned long flags;
156 160
161 local_irq_save(flags);
157 task_cputime(tsk, &utime, &stime); 162 task_cputime(tsk, &utime, &stime);
158 __acct_update_integrals(tsk, utime, stime); 163 __acct_update_integrals(tsk, utime, stime);
164 local_irq_restore(flags);
159} 165}
160 166
161/** 167/**
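The unit bookkeeping works out as follows: __acct_update_integrals() now accumulates delta (nanoseconds) times pages, shifted right by 10, so acct_rss_mem1/acct_vm_mem1 hold page-nsec/1024; xacct_add_tsk() then multiplies by PAGE_SIZE and divides by 1000 * KB to land back on the Mbyte-usec reported to userspace. A worked example with 4 KiB pages:

    rss = 256 pages (1 MiB), dtime = 1 ms = 1,000,000 ns

    acct_rss_mem1 += (1,000,000 * 256) >> 10          =   250,000
    coremem        = 250,000 * 4096 / (1000 * 1024)   =     1,000

    i.e. 1 Mbyte resident for 1,000 usec, as expected for 1 MiB held over 1 ms.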
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index db2dd3335c6a..65da997b430a 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -97,8 +97,8 @@ static void async_pf_execute(struct work_struct *work)
97 * This memory barrier pairs with prepare_to_wait's set_current_state() 97 * This memory barrier pairs with prepare_to_wait's set_current_state()
98 */ 98 */
99 smp_mb(); 99 smp_mb();
100 if (waitqueue_active(&vcpu->wq)) 100 if (swait_active(&vcpu->wq))
101 wake_up_interruptible(&vcpu->wq); 101 swake_up(&vcpu->wq);
102 102
103 mmput(mm); 103 mmput(mm);
104 kvm_put_kvm(vcpu->kvm); 104 kvm_put_kvm(vcpu->kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9102ae172d2a..5af50c3ddd53 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -216,8 +216,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
216 vcpu->kvm = kvm; 216 vcpu->kvm = kvm;
217 vcpu->vcpu_id = id; 217 vcpu->vcpu_id = id;
218 vcpu->pid = NULL; 218 vcpu->pid = NULL;
219 vcpu->halt_poll_ns = 0; 219 init_swait_queue_head(&vcpu->wq);
220 init_waitqueue_head(&vcpu->wq);
221 kvm_async_pf_vcpu_init(vcpu); 220 kvm_async_pf_vcpu_init(vcpu);
222 221
223 vcpu->pre_pcpu = -1; 222 vcpu->pre_pcpu = -1;
@@ -1993,7 +1992,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
1993void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1992void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1994{ 1993{
1995 ktime_t start, cur; 1994 ktime_t start, cur;
1996 DEFINE_WAIT(wait); 1995 DECLARE_SWAITQUEUE(wait);
1997 bool waited = false; 1996 bool waited = false;
1998 u64 block_ns; 1997 u64 block_ns;
1999 1998
@@ -2018,7 +2017,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2018 kvm_arch_vcpu_blocking(vcpu); 2017 kvm_arch_vcpu_blocking(vcpu);
2019 2018
2020 for (;;) { 2019 for (;;) {
2021 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2020 prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
2022 2021
2023 if (kvm_vcpu_check_block(vcpu) < 0) 2022 if (kvm_vcpu_check_block(vcpu) < 0)
2024 break; 2023 break;
@@ -2027,7 +2026,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2027 schedule(); 2026 schedule();
2028 } 2027 }
2029 2028
2030 finish_wait(&vcpu->wq, &wait); 2029 finish_swait(&vcpu->wq, &wait);
2031 cur = ktime_get(); 2030 cur = ktime_get();
2032 2031
2033 kvm_arch_vcpu_unblocking(vcpu); 2032 kvm_arch_vcpu_unblocking(vcpu);
@@ -2059,11 +2058,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
2059{ 2058{
2060 int me; 2059 int me;
2061 int cpu = vcpu->cpu; 2060 int cpu = vcpu->cpu;
2062 wait_queue_head_t *wqp; 2061 struct swait_queue_head *wqp;
2063 2062
2064 wqp = kvm_arch_vcpu_wq(vcpu); 2063 wqp = kvm_arch_vcpu_wq(vcpu);
2065 if (waitqueue_active(wqp)) { 2064 if (swait_active(wqp)) {
2066 wake_up_interruptible(wqp); 2065 swake_up(wqp);
2067 ++vcpu->stat.halt_wakeup; 2066 ++vcpu->stat.halt_wakeup;
2068 } 2067 }
2069 2068
@@ -2164,7 +2163,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
2164 continue; 2163 continue;
2165 if (vcpu == me) 2164 if (vcpu == me)
2166 continue; 2165 continue;
2167 if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) 2166 if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
2168 continue; 2167 continue;
2169 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2168 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
2170 continue; 2169 continue;
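Taken together, the virt/kvm hunks are a mechanical substitution of the simple-waitqueue API for the full one on the vCPU wait queue:

    wait_queue_head_t           ->  struct swait_queue_head
    init_waitqueue_head()       ->  init_swait_queue_head()
    DEFINE_WAIT()               ->  DECLARE_SWAITQUEUE()
    prepare_to_wait()           ->  prepare_to_swait()
    finish_wait()               ->  finish_swait()
    waitqueue_active()          ->  swait_active()
    wake_up_interruptible()     ->  swake_up()

Each wakeup here targets a single blocked vCPU thread (swake_up_locked() above wakes only the first entry on the list), so the lighter primitive is sufficient for this path.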