diff options
37 files changed, 1300 insertions, 715 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 000336733a6a..8ae47a7b4923 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
| @@ -3532,6 +3532,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
| 3532 | 3532 | ||
| 3533 | sched_debug [KNL] Enables verbose scheduler debug messages. | 3533 | sched_debug [KNL] Enables verbose scheduler debug messages. |
| 3534 | 3534 | ||
| 3535 | schedstats= [KNL,X86] Enable or disable scheduler statistics. | ||
| 3536 | Allowed values are enable and disable. This feature | ||
| 3537 | incurs a small amount of overhead in the scheduler | ||
| 3538 | but is useful for debugging and performance tuning. | ||
| 3539 | |||
| 3535 | skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate | 3540 | skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate |
| 3536 | xtime_lock contention on larger systems, and/or RCU lock | 3541 | xtime_lock contention on larger systems, and/or RCU lock |
| 3537 | contention on all systems with CONFIG_MAXSMP set. | 3542 | contention on all systems with CONFIG_MAXSMP set. |
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index f886fbb1ad05..f4444c94ff28 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
| @@ -773,6 +773,14 @@ rtsig-nr shows the number of RT signals currently queued. | |||
| 773 | 773 | ||
| 774 | ============================================================== | 774 | ============================================================== |
| 775 | 775 | ||
| 776 | sched_schedstats: | ||
| 777 | |||
| 778 | Enables/disables scheduler statistics. Enabling this feature | ||
| 779 | incurs a small amount of overhead in the scheduler but is | ||
| 780 | useful for debugging and performance tuning. | ||
| 781 | |||
| 782 | ============================================================== | ||
| 783 | |||
| 776 | sg-big-buff: | 784 | sg-big-buff: |
| 777 | 785 | ||
| 778 | This file shows the size of the generic SCSI (sg) buffer. | 786 | This file shows the size of the generic SCSI (sg) buffer. |
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dda1959f0dde..08e49c423c24 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c | |||
| @@ -506,18 +506,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm) | |||
| 506 | struct kvm_vcpu *vcpu; | 506 | struct kvm_vcpu *vcpu; |
| 507 | 507 | ||
| 508 | kvm_for_each_vcpu(i, vcpu, kvm) { | 508 | kvm_for_each_vcpu(i, vcpu, kvm) { |
| 509 | wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); | 509 | struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); |
| 510 | 510 | ||
| 511 | vcpu->arch.pause = false; | 511 | vcpu->arch.pause = false; |
| 512 | wake_up_interruptible(wq); | 512 | swake_up(wq); |
| 513 | } | 513 | } |
| 514 | } | 514 | } |
| 515 | 515 | ||
| 516 | static void vcpu_sleep(struct kvm_vcpu *vcpu) | 516 | static void vcpu_sleep(struct kvm_vcpu *vcpu) |
| 517 | { | 517 | { |
| 518 | wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); | 518 | struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); |
| 519 | 519 | ||
| 520 | wait_event_interruptible(*wq, ((!vcpu->arch.power_off) && | 520 | swait_event_interruptible(*wq, ((!vcpu->arch.power_off) && |
| 521 | (!vcpu->arch.pause))); | 521 | (!vcpu->arch.pause))); |
| 522 | } | 522 | } |
| 523 | 523 | ||
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index a9b3b905e661..c2b131527a64 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c | |||
| @@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) | |||
| 70 | { | 70 | { |
| 71 | struct kvm *kvm = source_vcpu->kvm; | 71 | struct kvm *kvm = source_vcpu->kvm; |
| 72 | struct kvm_vcpu *vcpu = NULL; | 72 | struct kvm_vcpu *vcpu = NULL; |
| 73 | wait_queue_head_t *wq; | 73 | struct swait_queue_head *wq; |
| 74 | unsigned long cpu_id; | 74 | unsigned long cpu_id; |
| 75 | unsigned long context_id; | 75 | unsigned long context_id; |
| 76 | phys_addr_t target_pc; | 76 | phys_addr_t target_pc; |
| @@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) | |||
| 119 | smp_mb(); /* Make sure the above is visible */ | 119 | smp_mb(); /* Make sure the above is visible */ |
| 120 | 120 | ||
| 121 | wq = kvm_arch_vcpu_wq(vcpu); | 121 | wq = kvm_arch_vcpu_wq(vcpu); |
| 122 | wake_up_interruptible(wq); | 122 | swake_up(wq); |
| 123 | 123 | ||
| 124 | return PSCI_RET_SUCCESS; | 124 | return PSCI_RET_SUCCESS; |
| 125 | } | 125 | } |
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 3110447ab1e9..70ef1a43c114 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c | |||
| @@ -445,8 +445,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | |||
| 445 | 445 | ||
| 446 | dvcpu->arch.wait = 0; | 446 | dvcpu->arch.wait = 0; |
| 447 | 447 | ||
| 448 | if (waitqueue_active(&dvcpu->wq)) | 448 | if (swait_active(&dvcpu->wq)) |
| 449 | wake_up_interruptible(&dvcpu->wq); | 449 | swake_up(&dvcpu->wq); |
| 450 | 450 | ||
| 451 | return 0; | 451 | return 0; |
| 452 | } | 452 | } |
| @@ -1174,8 +1174,8 @@ static void kvm_mips_comparecount_func(unsigned long data) | |||
| 1174 | kvm_mips_callbacks->queue_timer_int(vcpu); | 1174 | kvm_mips_callbacks->queue_timer_int(vcpu); |
| 1175 | 1175 | ||
| 1176 | vcpu->arch.wait = 0; | 1176 | vcpu->arch.wait = 0; |
| 1177 | if (waitqueue_active(&vcpu->wq)) | 1177 | if (swait_active(&vcpu->wq)) |
| 1178 | wake_up_interruptible(&vcpu->wq); | 1178 | swake_up(&vcpu->wq); |
| 1179 | } | 1179 | } |
| 1180 | 1180 | ||
| 1181 | /* low level hrtimer wake routine */ | 1181 | /* low level hrtimer wake routine */ |
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9d08d8cbed1a..c98afa538b3a 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h | |||
| @@ -289,7 +289,7 @@ struct kvmppc_vcore { | |||
| 289 | struct list_head runnable_threads; | 289 | struct list_head runnable_threads; |
| 290 | struct list_head preempt_list; | 290 | struct list_head preempt_list; |
| 291 | spinlock_t lock; | 291 | spinlock_t lock; |
| 292 | wait_queue_head_t wq; | 292 | struct swait_queue_head wq; |
| 293 | spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ | 293 | spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ |
| 294 | u64 stolen_tb; | 294 | u64 stolen_tb; |
| 295 | u64 preempt_tb; | 295 | u64 preempt_tb; |
| @@ -629,7 +629,7 @@ struct kvm_vcpu_arch { | |||
| 629 | u8 prodded; | 629 | u8 prodded; |
| 630 | u32 last_inst; | 630 | u32 last_inst; |
| 631 | 631 | ||
| 632 | wait_queue_head_t *wqp; | 632 | struct swait_queue_head *wqp; |
| 633 | struct kvmppc_vcore *vcore; | 633 | struct kvmppc_vcore *vcore; |
| 634 | int ret; | 634 | int ret; |
| 635 | int trap; | 635 | int trap; |
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index baeddb06811d..f1187bb6dd4d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c | |||
| @@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu) | |||
| 114 | static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) | 114 | static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) |
| 115 | { | 115 | { |
| 116 | int cpu; | 116 | int cpu; |
| 117 | wait_queue_head_t *wqp; | 117 | struct swait_queue_head *wqp; |
| 118 | 118 | ||
| 119 | wqp = kvm_arch_vcpu_wq(vcpu); | 119 | wqp = kvm_arch_vcpu_wq(vcpu); |
| 120 | if (waitqueue_active(wqp)) { | 120 | if (swait_active(wqp)) { |
| 121 | wake_up_interruptible(wqp); | 121 | swake_up(wqp); |
| 122 | ++vcpu->stat.halt_wakeup; | 122 | ++vcpu->stat.halt_wakeup; |
| 123 | } | 123 | } |
| 124 | 124 | ||
| @@ -701,8 +701,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) | |||
| 701 | tvcpu->arch.prodded = 1; | 701 | tvcpu->arch.prodded = 1; |
| 702 | smp_mb(); | 702 | smp_mb(); |
| 703 | if (vcpu->arch.ceded) { | 703 | if (vcpu->arch.ceded) { |
| 704 | if (waitqueue_active(&vcpu->wq)) { | 704 | if (swait_active(&vcpu->wq)) { |
| 705 | wake_up_interruptible(&vcpu->wq); | 705 | swake_up(&vcpu->wq); |
| 706 | vcpu->stat.halt_wakeup++; | 706 | vcpu->stat.halt_wakeup++; |
| 707 | } | 707 | } |
| 708 | } | 708 | } |
| @@ -1459,7 +1459,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) | |||
| 1459 | INIT_LIST_HEAD(&vcore->runnable_threads); | 1459 | INIT_LIST_HEAD(&vcore->runnable_threads); |
| 1460 | spin_lock_init(&vcore->lock); | 1460 | spin_lock_init(&vcore->lock); |
| 1461 | spin_lock_init(&vcore->stoltb_lock); | 1461 | spin_lock_init(&vcore->stoltb_lock); |
| 1462 | init_waitqueue_head(&vcore->wq); | 1462 | init_swait_queue_head(&vcore->wq); |
| 1463 | vcore->preempt_tb = TB_NIL; | 1463 | vcore->preempt_tb = TB_NIL; |
| 1464 | vcore->lpcr = kvm->arch.lpcr; | 1464 | vcore->lpcr = kvm->arch.lpcr; |
| 1465 | vcore->first_vcpuid = core * threads_per_subcore; | 1465 | vcore->first_vcpuid = core * threads_per_subcore; |
| @@ -2531,10 +2531,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) | |||
| 2531 | { | 2531 | { |
| 2532 | struct kvm_vcpu *vcpu; | 2532 | struct kvm_vcpu *vcpu; |
| 2533 | int do_sleep = 1; | 2533 | int do_sleep = 1; |
| 2534 | DECLARE_SWAITQUEUE(wait); | ||
| 2534 | 2535 | ||
| 2535 | DEFINE_WAIT(wait); | 2536 | prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); |
| 2536 | |||
| 2537 | prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); | ||
| 2538 | 2537 | ||
| 2539 | /* | 2538 | /* |
| 2540 | * Check one last time for pending exceptions and ceded state after | 2539 | * Check one last time for pending exceptions and ceded state after |
| @@ -2548,7 +2547,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) | |||
| 2548 | } | 2547 | } |
| 2549 | 2548 | ||
| 2550 | if (!do_sleep) { | 2549 | if (!do_sleep) { |
| 2551 | finish_wait(&vc->wq, &wait); | 2550 | finish_swait(&vc->wq, &wait); |
| 2552 | return; | 2551 | return; |
| 2553 | } | 2552 | } |
| 2554 | 2553 | ||
| @@ -2556,7 +2555,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) | |||
| 2556 | trace_kvmppc_vcore_blocked(vc, 0); | 2555 | trace_kvmppc_vcore_blocked(vc, 0); |
| 2557 | spin_unlock(&vc->lock); | 2556 | spin_unlock(&vc->lock); |
| 2558 | schedule(); | 2557 | schedule(); |
| 2559 | finish_wait(&vc->wq, &wait); | 2558 | finish_swait(&vc->wq, &wait); |
| 2560 | spin_lock(&vc->lock); | 2559 | spin_lock(&vc->lock); |
| 2561 | vc->vcore_state = VCORE_INACTIVE; | 2560 | vc->vcore_state = VCORE_INACTIVE; |
| 2562 | trace_kvmppc_vcore_blocked(vc, 1); | 2561 | trace_kvmppc_vcore_blocked(vc, 1); |
| @@ -2612,7 +2611,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
| 2612 | kvmppc_start_thread(vcpu, vc); | 2611 | kvmppc_start_thread(vcpu, vc); |
| 2613 | trace_kvm_guest_enter(vcpu); | 2612 | trace_kvm_guest_enter(vcpu); |
| 2614 | } else if (vc->vcore_state == VCORE_SLEEPING) { | 2613 | } else if (vc->vcore_state == VCORE_SLEEPING) { |
| 2615 | wake_up(&vc->wq); | 2614 | swake_up(&vc->wq); |
| 2616 | } | 2615 | } |
| 2617 | 2616 | ||
| 2618 | } | 2617 | } |
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8959ebb6d2c9..b0c8ad0799c7 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h | |||
| @@ -467,7 +467,7 @@ struct kvm_s390_irq_payload { | |||
| 467 | struct kvm_s390_local_interrupt { | 467 | struct kvm_s390_local_interrupt { |
| 468 | spinlock_t lock; | 468 | spinlock_t lock; |
| 469 | struct kvm_s390_float_interrupt *float_int; | 469 | struct kvm_s390_float_interrupt *float_int; |
| 470 | wait_queue_head_t *wq; | 470 | struct swait_queue_head *wq; |
| 471 | atomic_t *cpuflags; | 471 | atomic_t *cpuflags; |
| 472 | DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); | 472 | DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); |
| 473 | struct kvm_s390_irq_payload irq; | 473 | struct kvm_s390_irq_payload irq; |
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f88ca72c3a77..9ffc73221792 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c | |||
| @@ -966,13 +966,13 @@ no_timer: | |||
| 966 | 966 | ||
| 967 | void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) | 967 | void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) |
| 968 | { | 968 | { |
| 969 | if (waitqueue_active(&vcpu->wq)) { | 969 | if (swait_active(&vcpu->wq)) { |
| 970 | /* | 970 | /* |
| 971 | * The vcpu gave up the cpu voluntarily, mark it as a good | 971 | * The vcpu gave up the cpu voluntarily, mark it as a good |
| 972 | * yield-candidate. | 972 | * yield-candidate. |
| 973 | */ | 973 | */ |
| 974 | vcpu->preempted = true; | 974 | vcpu->preempted = true; |
| 975 | wake_up_interruptible(&vcpu->wq); | 975 | swake_up(&vcpu->wq); |
| 976 | vcpu->stat.halt_wakeup++; | 976 | vcpu->stat.halt_wakeup++; |
| 977 | } | 977 | } |
| 978 | } | 978 | } |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 36591faed13b..3a045f39ed81 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
| @@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) | |||
| 1195 | static void apic_timer_expired(struct kvm_lapic *apic) | 1195 | static void apic_timer_expired(struct kvm_lapic *apic) |
| 1196 | { | 1196 | { |
| 1197 | struct kvm_vcpu *vcpu = apic->vcpu; | 1197 | struct kvm_vcpu *vcpu = apic->vcpu; |
| 1198 | wait_queue_head_t *q = &vcpu->wq; | 1198 | struct swait_queue_head *q = &vcpu->wq; |
| 1199 | struct kvm_timer *ktimer = &apic->lapic_timer; | 1199 | struct kvm_timer *ktimer = &apic->lapic_timer; |
| 1200 | 1200 | ||
| 1201 | if (atomic_read(&apic->lapic_timer.pending)) | 1201 | if (atomic_read(&apic->lapic_timer.pending)) |
| @@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic) | |||
| 1204 | atomic_inc(&apic->lapic_timer.pending); | 1204 | atomic_inc(&apic->lapic_timer.pending); |
| 1205 | kvm_set_pending_timer(vcpu); | 1205 | kvm_set_pending_timer(vcpu); |
| 1206 | 1206 | ||
| 1207 | if (waitqueue_active(q)) | 1207 | if (swait_active(q)) |
| 1208 | wake_up_interruptible(q); | 1208 | swake_up(q); |
| 1209 | 1209 | ||
| 1210 | if (apic_lvtt_tscdeadline(apic)) | 1210 | if (apic_lvtt_tscdeadline(apic)) |
| 1211 | ktimer->expired_tscdeadline = ktimer->tscdeadline; | 1211 | ktimer->expired_tscdeadline = ktimer->tscdeadline; |
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c2b340e23f62..6d9df3f7e334 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h | |||
| @@ -713,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled) | |||
| 713 | #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) | 713 | #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) |
| 714 | #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) | 714 | #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) |
| 715 | 715 | ||
| 716 | static inline unsigned long get_lock_parent_ip(void) | ||
| 717 | { | ||
| 718 | unsigned long addr = CALLER_ADDR0; | ||
| 719 | |||
| 720 | if (!in_lock_functions(addr)) | ||
| 721 | return addr; | ||
| 722 | addr = CALLER_ADDR1; | ||
| 723 | if (!in_lock_functions(addr)) | ||
| 724 | return addr; | ||
| 725 | return CALLER_ADDR2; | ||
| 726 | } | ||
| 727 | |||
| 716 | #ifdef CONFIG_IRQSOFF_TRACER | 728 | #ifdef CONFIG_IRQSOFF_TRACER |
| 717 | extern void time_hardirqs_on(unsigned long a0, unsigned long a1); | 729 | extern void time_hardirqs_on(unsigned long a0, unsigned long a1); |
| 718 | extern void time_hardirqs_off(unsigned long a0, unsigned long a1); | 730 | extern void time_hardirqs_off(unsigned long a0, unsigned long a1); |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 861f690aa791..5276fe0916fc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/irqflags.h> | 25 | #include <linux/irqflags.h> |
| 26 | #include <linux/context_tracking.h> | 26 | #include <linux/context_tracking.h> |
| 27 | #include <linux/irqbypass.h> | 27 | #include <linux/irqbypass.h> |
| 28 | #include <linux/swait.h> | ||
| 28 | #include <asm/signal.h> | 29 | #include <asm/signal.h> |
| 29 | 30 | ||
| 30 | #include <linux/kvm.h> | 31 | #include <linux/kvm.h> |
| @@ -218,7 +219,7 @@ struct kvm_vcpu { | |||
| 218 | int fpu_active; | 219 | int fpu_active; |
| 219 | int guest_fpu_loaded, guest_xcr0_loaded; | 220 | int guest_fpu_loaded, guest_xcr0_loaded; |
| 220 | unsigned char fpu_counter; | 221 | unsigned char fpu_counter; |
| 221 | wait_queue_head_t wq; | 222 | struct swait_queue_head wq; |
| 222 | struct pid *pid; | 223 | struct pid *pid; |
| 223 | int sigset_active; | 224 | int sigset_active; |
| 224 | sigset_t sigset; | 225 | sigset_t sigset; |
| @@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm) | |||
| 782 | } | 783 | } |
| 783 | #endif | 784 | #endif |
| 784 | 785 | ||
| 785 | static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) | 786 | static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) |
| 786 | { | 787 | { |
| 787 | #ifdef __KVM_HAVE_ARCH_WQP | 788 | #ifdef __KVM_HAVE_ARCH_WQP |
| 788 | return vcpu->arch.wqp; | 789 | return vcpu->arch.wqp; |
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index e23121f9d82a..59ccab297ae0 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h | |||
| @@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) | |||
| 37 | 37 | ||
| 38 | void clear_all_latency_tracing(struct task_struct *p); | 38 | void clear_all_latency_tracing(struct task_struct *p); |
| 39 | 39 | ||
| 40 | extern int sysctl_latencytop(struct ctl_table *table, int write, | ||
| 41 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
| 42 | |||
| 40 | #else | 43 | #else |
| 41 | 44 | ||
| 42 | static inline void | 45 | static inline void |
diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a94cc3..838a89a78332 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -182,8 +182,6 @@ extern void update_cpu_load_nohz(int active); | |||
| 182 | static inline void update_cpu_load_nohz(int active) { } | 182 | static inline void update_cpu_load_nohz(int active) { } |
| 183 | #endif | 183 | #endif |
| 184 | 184 | ||
| 185 | extern unsigned long get_parent_ip(unsigned long addr); | ||
| 186 | |||
| 187 | extern void dump_cpu_task(int cpu); | 185 | extern void dump_cpu_task(int cpu); |
| 188 | 186 | ||
| 189 | struct seq_file; | 187 | struct seq_file; |
| @@ -920,6 +918,10 @@ static inline int sched_info_on(void) | |||
| 920 | #endif | 918 | #endif |
| 921 | } | 919 | } |
| 922 | 920 | ||
| 921 | #ifdef CONFIG_SCHEDSTATS | ||
| 922 | void force_schedstat_enabled(void); | ||
| 923 | #endif | ||
| 924 | |||
| 923 | enum cpu_idle_type { | 925 | enum cpu_idle_type { |
| 924 | CPU_IDLE, | 926 | CPU_IDLE, |
| 925 | CPU_NOT_IDLE, | 927 | CPU_NOT_IDLE, |
| @@ -1289,6 +1291,8 @@ struct sched_rt_entity { | |||
| 1289 | unsigned long timeout; | 1291 | unsigned long timeout; |
| 1290 | unsigned long watchdog_stamp; | 1292 | unsigned long watchdog_stamp; |
| 1291 | unsigned int time_slice; | 1293 | unsigned int time_slice; |
| 1294 | unsigned short on_rq; | ||
| 1295 | unsigned short on_list; | ||
| 1292 | 1296 | ||
| 1293 | struct sched_rt_entity *back; | 1297 | struct sched_rt_entity *back; |
| 1294 | #ifdef CONFIG_RT_GROUP_SCHED | 1298 | #ifdef CONFIG_RT_GROUP_SCHED |
| @@ -1329,10 +1333,6 @@ struct sched_dl_entity { | |||
| 1329 | * task has to wait for a replenishment to be performed at the | 1333 | * task has to wait for a replenishment to be performed at the |
| 1330 | * next firing of dl_timer. | 1334 | * next firing of dl_timer. |
| 1331 | * | 1335 | * |
| 1332 | * @dl_new tells if a new instance arrived. If so we must | ||
| 1333 | * start executing it with full runtime and reset its absolute | ||
| 1334 | * deadline; | ||
| 1335 | * | ||
| 1336 | * @dl_boosted tells if we are boosted due to DI. If so we are | 1336 | * @dl_boosted tells if we are boosted due to DI. If so we are |
| 1337 | * outside bandwidth enforcement mechanism (but only until we | 1337 | * outside bandwidth enforcement mechanism (but only until we |
| 1338 | * exit the critical section); | 1338 | * exit the critical section); |
| @@ -1340,7 +1340,7 @@ struct sched_dl_entity { | |||
| 1340 | * @dl_yielded tells if task gave up the cpu before consuming | 1340 | * @dl_yielded tells if task gave up the cpu before consuming |
| 1341 | * all its available runtime during the last job. | 1341 | * all its available runtime during the last job. |
| 1342 | */ | 1342 | */ |
| 1343 | int dl_throttled, dl_new, dl_boosted, dl_yielded; | 1343 | int dl_throttled, dl_boosted, dl_yielded; |
| 1344 | 1344 | ||
| 1345 | /* | 1345 | /* |
| 1346 | * Bandwidth enforcement timer. Each -deadline task has its | 1346 | * Bandwidth enforcement timer. Each -deadline task has its |
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c9e4731cf10b..4f080ab4f2cd 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h | |||
| @@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write, | |||
| 95 | void __user *buffer, size_t *lenp, | 95 | void __user *buffer, size_t *lenp, |
| 96 | loff_t *ppos); | 96 | loff_t *ppos); |
| 97 | 97 | ||
| 98 | extern int sysctl_schedstats(struct ctl_table *table, int write, | ||
| 99 | void __user *buffer, size_t *lenp, | ||
| 100 | loff_t *ppos); | ||
| 101 | |||
| 98 | #endif /* _SCHED_SYSCTL_H */ | 102 | #endif /* _SCHED_SYSCTL_H */ |
diff --git a/include/linux/swait.h b/include/linux/swait.h new file mode 100644 index 000000000000..c1f9c62a8a50 --- /dev/null +++ b/include/linux/swait.h | |||
| @@ -0,0 +1,172 @@ | |||
| 1 | #ifndef _LINUX_SWAIT_H | ||
| 2 | #define _LINUX_SWAIT_H | ||
| 3 | |||
| 4 | #include <linux/list.h> | ||
| 5 | #include <linux/stddef.h> | ||
| 6 | #include <linux/spinlock.h> | ||
| 7 | #include <asm/current.h> | ||
| 8 | |||
| 9 | /* | ||
| 10 | * Simple wait queues | ||
| 11 | * | ||
| 12 | * While these are very similar to the other/complex wait queues (wait.h) the | ||
| 13 | * most important difference is that the simple waitqueue allows for | ||
| 14 | * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold | ||
| 15 | * times. | ||
| 16 | * | ||
| 17 | * In order to make this so, we had to drop a fair number of features of the | ||
| 18 | * other waitqueue code; notably: | ||
| 19 | * | ||
| 20 | * - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue; | ||
| 21 | * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right | ||
| 22 | * sleeper state. | ||
| 23 | * | ||
| 24 | * - the exclusive mode; because this requires preserving the list order | ||
| 25 | * and this is hard. | ||
| 26 | * | ||
| 27 | * - custom wake functions; because you cannot give any guarantees about | ||
| 28 | * random code. | ||
| 29 | * | ||
| 30 | * As a side effect of this; the data structures are slimmer. | ||
| 31 | * | ||
| 32 | * One would recommend using this wait queue where possible. | ||
| 33 | */ | ||
| 34 | |||
| 35 | struct task_struct; | ||
| 36 | |||
| 37 | struct swait_queue_head { | ||
| 38 | raw_spinlock_t lock; | ||
| 39 | struct list_head task_list; | ||
| 40 | }; | ||
| 41 | |||
| 42 | struct swait_queue { | ||
| 43 | struct task_struct *task; | ||
| 44 | struct list_head task_list; | ||
| 45 | }; | ||
| 46 | |||
| 47 | #define __SWAITQUEUE_INITIALIZER(name) { \ | ||
| 48 | .task = current, \ | ||
| 49 | .task_list = LIST_HEAD_INIT((name).task_list), \ | ||
| 50 | } | ||
| 51 | |||
| 52 | #define DECLARE_SWAITQUEUE(name) \ | ||
| 53 | struct swait_queue name = __SWAITQUEUE_INITIALIZER(name) | ||
| 54 | |||
| 55 | #define __SWAIT_QUEUE_HEAD_INITIALIZER(name) { \ | ||
| 56 | .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ | ||
| 57 | .task_list = LIST_HEAD_INIT((name).task_list), \ | ||
| 58 | } | ||
| 59 | |||
| 60 | #define DECLARE_SWAIT_QUEUE_HEAD(name) \ | ||
| 61 | struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name) | ||
| 62 | |||
| 63 | extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name, | ||
| 64 | struct lock_class_key *key); | ||
| 65 | |||
| 66 | #define init_swait_queue_head(q) \ | ||
| 67 | do { \ | ||
| 68 | static struct lock_class_key __key; \ | ||
| 69 | __init_swait_queue_head((q), #q, &__key); \ | ||
| 70 | } while (0) | ||
| 71 | |||
| 72 | #ifdef CONFIG_LOCKDEP | ||
| 73 | # define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ | ||
| 74 | ({ init_swait_queue_head(&name); name; }) | ||
| 75 | # define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ | ||
| 76 | struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) | ||
| 77 | #else | ||
| 78 | # define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ | ||
| 79 | DECLARE_SWAIT_QUEUE_HEAD(name) | ||
| 80 | #endif | ||
| 81 | |||
| 82 | static inline int swait_active(struct swait_queue_head *q) | ||
| 83 | { | ||
| 84 | return !list_empty(&q->task_list); | ||
| 85 | } | ||
| 86 | |||
| 87 | extern void swake_up(struct swait_queue_head *q); | ||
| 88 | extern void swake_up_all(struct swait_queue_head *q); | ||
| 89 | extern void swake_up_locked(struct swait_queue_head *q); | ||
| 90 | |||
| 91 | extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); | ||
| 92 | extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); | ||
| 93 | extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); | ||
| 94 | |||
| 95 | extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); | ||
| 96 | extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); | ||
| 97 | |||
| 98 | /* as per ___wait_event() but for swait, therefore "exclusive == 0" */ | ||
| 99 | #define ___swait_event(wq, condition, state, ret, cmd) \ | ||
| 100 | ({ \ | ||
| 101 | struct swait_queue __wait; \ | ||
| 102 | long __ret = ret; \ | ||
| 103 | \ | ||
| 104 | INIT_LIST_HEAD(&__wait.task_list); \ | ||
| 105 | for (;;) { \ | ||
| 106 | long __int = prepare_to_swait_event(&wq, &__wait, state);\ | ||
| 107 | \ | ||
| 108 | if (condition) \ | ||
| 109 | break; \ | ||
| 110 | \ | ||
| 111 | if (___wait_is_interruptible(state) && __int) { \ | ||
| 112 | __ret = __int; \ | ||
| 113 | break; \ | ||
| 114 | } \ | ||
| 115 | \ | ||
| 116 | cmd; \ | ||
| 117 | } \ | ||
| 118 | finish_swait(&wq, &__wait); \ | ||
| 119 | __ret; \ | ||
| 120 | }) | ||
| 121 | |||
| 122 | #define __swait_event(wq, condition) \ | ||
| 123 | (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ | ||
| 124 | schedule()) | ||
| 125 | |||
| 126 | #define swait_event(wq, condition) \ | ||
| 127 | do { \ | ||
| 128 | if (condition) \ | ||
| 129 | break; \ | ||
| 130 | __swait_event(wq, condition); \ | ||
| 131 | } while (0) | ||
| 132 | |||
| 133 | #define __swait_event_timeout(wq, condition, timeout) \ | ||
| 134 | ___swait_event(wq, ___wait_cond_timeout(condition), \ | ||
| 135 | TASK_UNINTERRUPTIBLE, timeout, \ | ||
| 136 | __ret = schedule_timeout(__ret)) | ||
| 137 | |||
| 138 | #define swait_event_timeout(wq, condition, timeout) \ | ||
| 139 | ({ \ | ||
| 140 | long __ret = timeout; \ | ||
| 141 | if (!___wait_cond_timeout(condition)) \ | ||
| 142 | __ret = __swait_event_timeout(wq, condition, timeout); \ | ||
| 143 | __ret; \ | ||
| 144 | }) | ||
| 145 | |||
| 146 | #define __swait_event_interruptible(wq, condition) \ | ||
| 147 | ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ | ||
| 148 | schedule()) | ||
| 149 | |||
| 150 | #define swait_event_interruptible(wq, condition) \ | ||
| 151 | ({ \ | ||
| 152 | int __ret = 0; \ | ||
| 153 | if (!(condition)) \ | ||
| 154 | __ret = __swait_event_interruptible(wq, condition); \ | ||
| 155 | __ret; \ | ||
| 156 | }) | ||
| 157 | |||
| 158 | #define __swait_event_interruptible_timeout(wq, condition, timeout) \ | ||
| 159 | ___swait_event(wq, ___wait_cond_timeout(condition), \ | ||
| 160 | TASK_INTERRUPTIBLE, timeout, \ | ||
| 161 | __ret = schedule_timeout(__ret)) | ||
| 162 | |||
| 163 | #define swait_event_interruptible_timeout(wq, condition, timeout) \ | ||
| 164 | ({ \ | ||
| 165 | long __ret = timeout; \ | ||
| 166 | if (!___wait_cond_timeout(condition)) \ | ||
| 167 | __ret = __swait_event_interruptible_timeout(wq, \ | ||
| 168 | condition, timeout); \ | ||
| 169 | __ret; \ | ||
| 170 | }) | ||
| 171 | |||
| 172 | #endif /* _LINUX_SWAIT_H */ | ||
diff --git a/include/linux/wait.h b/include/linux/wait.h index ae71a769b89e..27d7a0ab5da3 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h | |||
| @@ -338,7 +338,7 @@ do { \ | |||
| 338 | schedule(); try_to_freeze()) | 338 | schedule(); try_to_freeze()) |
| 339 | 339 | ||
| 340 | /** | 340 | /** |
| 341 | * wait_event - sleep (or freeze) until a condition gets true | 341 | * wait_event_freezable - sleep (or freeze) until a condition gets true |
| 342 | * @wq: the waitqueue to wait on | 342 | * @wq: the waitqueue to wait on |
| 343 | * @condition: a C expression for the event to wait for | 343 | * @condition: a C expression for the event to wait for |
| 344 | * | 344 | * |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a02812743a7e..b5c30d9f46c5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
| @@ -47,12 +47,12 @@ | |||
| 47 | * of times) | 47 | * of times) |
| 48 | */ | 48 | */ |
| 49 | 49 | ||
| 50 | #include <linux/latencytop.h> | ||
| 51 | #include <linux/kallsyms.h> | 50 | #include <linux/kallsyms.h> |
| 52 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
| 53 | #include <linux/notifier.h> | 52 | #include <linux/notifier.h> |
| 54 | #include <linux/spinlock.h> | 53 | #include <linux/spinlock.h> |
| 55 | #include <linux/proc_fs.h> | 54 | #include <linux/proc_fs.h> |
| 55 | #include <linux/latencytop.h> | ||
| 56 | #include <linux/export.h> | 56 | #include <linux/export.h> |
| 57 | #include <linux/sched.h> | 57 | #include <linux/sched.h> |
| 58 | #include <linux/list.h> | 58 | #include <linux/list.h> |
| @@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void) | |||
| 289 | proc_create("latency_stats", 0644, NULL, &lstats_fops); | 289 | proc_create("latency_stats", 0644, NULL, &lstats_fops); |
| 290 | return 0; | 290 | return 0; |
| 291 | } | 291 | } |
| 292 | |||
| 293 | int sysctl_latencytop(struct ctl_table *table, int write, | ||
| 294 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 295 | { | ||
| 296 | int err; | ||
| 297 | |||
| 298 | err = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 299 | if (latencytop_enabled) | ||
| 300 | force_schedstat_enabled(); | ||
| 301 | |||
| 302 | return err; | ||
| 303 | } | ||
| 292 | device_initcall(init_lstats_procfs); | 304 | device_initcall(init_lstats_procfs); |
diff --git a/kernel/profile.c b/kernel/profile.c index 99513e1160e5..51369697466e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -59,6 +59,7 @@ int profile_setup(char *str) | |||
| 59 | 59 | ||
| 60 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { | 60 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { |
| 61 | #ifdef CONFIG_SCHEDSTATS | 61 | #ifdef CONFIG_SCHEDSTATS |
| 62 | force_schedstat_enabled(); | ||
| 62 | prof_on = SLEEP_PROFILING; | 63 | prof_on = SLEEP_PROFILING; |
| 63 | if (str[strlen(sleepstr)] == ',') | 64 | if (str[strlen(sleepstr)] == ',') |
| 64 | str += strlen(sleepstr) + 1; | 65 | str += strlen(sleepstr) + 1; |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd4131f7a..9fd5b628a88d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -1614,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 1614 | int needmore; | 1614 | int needmore; |
| 1615 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1615 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
| 1616 | 1616 | ||
| 1617 | rcu_nocb_gp_cleanup(rsp, rnp); | ||
| 1618 | rnp->need_future_gp[c & 0x1] = 0; | 1617 | rnp->need_future_gp[c & 0x1] = 0; |
| 1619 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; | 1618 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; |
| 1620 | trace_rcu_future_gp(rnp, rdp, c, | 1619 | trace_rcu_future_gp(rnp, rdp, c, |
| @@ -1635,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) | |||
| 1635 | !READ_ONCE(rsp->gp_flags) || | 1634 | !READ_ONCE(rsp->gp_flags) || |
| 1636 | !rsp->gp_kthread) | 1635 | !rsp->gp_kthread) |
| 1637 | return; | 1636 | return; |
| 1638 | wake_up(&rsp->gp_wq); | 1637 | swake_up(&rsp->gp_wq); |
| 1639 | } | 1638 | } |
| 1640 | 1639 | ||
| 1641 | /* | 1640 | /* |
| @@ -2010,6 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 2010 | int nocb = 0; | 2009 | int nocb = 0; |
| 2011 | struct rcu_data *rdp; | 2010 | struct rcu_data *rdp; |
| 2012 | struct rcu_node *rnp = rcu_get_root(rsp); | 2011 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 2012 | struct swait_queue_head *sq; | ||
| 2013 | 2013 | ||
| 2014 | WRITE_ONCE(rsp->gp_activity, jiffies); | 2014 | WRITE_ONCE(rsp->gp_activity, jiffies); |
| 2015 | raw_spin_lock_irq_rcu_node(rnp); | 2015 | raw_spin_lock_irq_rcu_node(rnp); |
| @@ -2046,7 +2046,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 2046 | needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; | 2046 | needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; |
| 2047 | /* smp_mb() provided by prior unlock-lock pair. */ | 2047 | /* smp_mb() provided by prior unlock-lock pair. */ |
| 2048 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 2048 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
| 2049 | sq = rcu_nocb_gp_get(rnp); | ||
| 2049 | raw_spin_unlock_irq(&rnp->lock); | 2050 | raw_spin_unlock_irq(&rnp->lock); |
| 2051 | rcu_nocb_gp_cleanup(sq); | ||
| 2050 | cond_resched_rcu_qs(); | 2052 | cond_resched_rcu_qs(); |
| 2051 | WRITE_ONCE(rsp->gp_activity, jiffies); | 2053 | WRITE_ONCE(rsp->gp_activity, jiffies); |
| 2052 | rcu_gp_slow(rsp, gp_cleanup_delay); | 2054 | rcu_gp_slow(rsp, gp_cleanup_delay); |
| @@ -2092,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 2092 | READ_ONCE(rsp->gpnum), | 2094 | READ_ONCE(rsp->gpnum), |
| 2093 | TPS("reqwait")); | 2095 | TPS("reqwait")); |
| 2094 | rsp->gp_state = RCU_GP_WAIT_GPS; | 2096 | rsp->gp_state = RCU_GP_WAIT_GPS; |
| 2095 | wait_event_interruptible(rsp->gp_wq, | 2097 | swait_event_interruptible(rsp->gp_wq, |
| 2096 | READ_ONCE(rsp->gp_flags) & | 2098 | READ_ONCE(rsp->gp_flags) & |
| 2097 | RCU_GP_FLAG_INIT); | 2099 | RCU_GP_FLAG_INIT); |
| 2098 | rsp->gp_state = RCU_GP_DONE_GPS; | 2100 | rsp->gp_state = RCU_GP_DONE_GPS; |
| @@ -2122,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 2122 | READ_ONCE(rsp->gpnum), | 2124 | READ_ONCE(rsp->gpnum), |
| 2123 | TPS("fqswait")); | 2125 | TPS("fqswait")); |
| 2124 | rsp->gp_state = RCU_GP_WAIT_FQS; | 2126 | rsp->gp_state = RCU_GP_WAIT_FQS; |
| 2125 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | 2127 | ret = swait_event_interruptible_timeout(rsp->gp_wq, |
| 2126 | rcu_gp_fqs_check_wake(rsp, &gf), j); | 2128 | rcu_gp_fqs_check_wake(rsp, &gf), j); |
| 2127 | rsp->gp_state = RCU_GP_DOING_FQS; | 2129 | rsp->gp_state = RCU_GP_DOING_FQS; |
| 2128 | /* Locking provides needed memory barriers. */ | 2130 | /* Locking provides needed memory barriers. */ |
| @@ -2246,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
| 2246 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 2248 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
| 2247 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2249 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
| 2248 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | 2250 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); |
| 2249 | rcu_gp_kthread_wake(rsp); | 2251 | swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ |
| 2250 | } | 2252 | } |
| 2251 | 2253 | ||
| 2252 | /* | 2254 | /* |
| @@ -2900,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
| 2900 | } | 2902 | } |
| 2901 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2903 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
| 2902 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2904 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
| 2903 | rcu_gp_kthread_wake(rsp); | 2905 | swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ |
| 2904 | } | 2906 | } |
| 2905 | 2907 | ||
| 2906 | /* | 2908 | /* |
| @@ -3529,7 +3531,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 3529 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 3531 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 3530 | if (wake) { | 3532 | if (wake) { |
| 3531 | smp_mb(); /* EGP done before wake_up(). */ | 3533 | smp_mb(); /* EGP done before wake_up(). */ |
| 3532 | wake_up(&rsp->expedited_wq); | 3534 | swake_up(&rsp->expedited_wq); |
| 3533 | } | 3535 | } |
| 3534 | break; | 3536 | break; |
| 3535 | } | 3537 | } |
| @@ -3780,7 +3782,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
| 3780 | jiffies_start = jiffies; | 3782 | jiffies_start = jiffies; |
| 3781 | 3783 | ||
| 3782 | for (;;) { | 3784 | for (;;) { |
| 3783 | ret = wait_event_interruptible_timeout( | 3785 | ret = swait_event_timeout( |
| 3784 | rsp->expedited_wq, | 3786 | rsp->expedited_wq, |
| 3785 | sync_rcu_preempt_exp_done(rnp_root), | 3787 | sync_rcu_preempt_exp_done(rnp_root), |
| 3786 | jiffies_stall); | 3788 | jiffies_stall); |
| @@ -3788,7 +3790,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
| 3788 | return; | 3790 | return; |
| 3789 | if (ret < 0) { | 3791 | if (ret < 0) { |
| 3790 | /* Hit a signal, disable CPU stall warnings. */ | 3792 | /* Hit a signal, disable CPU stall warnings. */ |
| 3791 | wait_event(rsp->expedited_wq, | 3793 | swait_event(rsp->expedited_wq, |
| 3792 | sync_rcu_preempt_exp_done(rnp_root)); | 3794 | sync_rcu_preempt_exp_done(rnp_root)); |
| 3793 | return; | 3795 | return; |
| 3794 | } | 3796 | } |
| @@ -4482,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
| 4482 | } | 4484 | } |
| 4483 | } | 4485 | } |
| 4484 | 4486 | ||
| 4485 | init_waitqueue_head(&rsp->gp_wq); | 4487 | init_swait_queue_head(&rsp->gp_wq); |
| 4486 | init_waitqueue_head(&rsp->expedited_wq); | 4488 | init_swait_queue_head(&rsp->expedited_wq); |
| 4487 | rnp = rsp->level[rcu_num_lvls - 1]; | 4489 | rnp = rsp->level[rcu_num_lvls - 1]; |
| 4488 | for_each_possible_cpu(i) { | 4490 | for_each_possible_cpu(i) { |
| 4489 | while (i > rnp->grphi) | 4491 | while (i > rnp->grphi) |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 83360b4f4352..bbd235d0e71f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/threads.h> | 27 | #include <linux/threads.h> |
| 28 | #include <linux/cpumask.h> | 28 | #include <linux/cpumask.h> |
| 29 | #include <linux/seqlock.h> | 29 | #include <linux/seqlock.h> |
| 30 | #include <linux/swait.h> | ||
| 30 | #include <linux/stop_machine.h> | 31 | #include <linux/stop_machine.h> |
| 31 | 32 | ||
| 32 | /* | 33 | /* |
| @@ -243,7 +244,7 @@ struct rcu_node { | |||
| 243 | /* Refused to boost: not sure why, though. */ | 244 | /* Refused to boost: not sure why, though. */ |
| 244 | /* This can happen due to race conditions. */ | 245 | /* This can happen due to race conditions. */ |
| 245 | #ifdef CONFIG_RCU_NOCB_CPU | 246 | #ifdef CONFIG_RCU_NOCB_CPU |
| 246 | wait_queue_head_t nocb_gp_wq[2]; | 247 | struct swait_queue_head nocb_gp_wq[2]; |
| 247 | /* Place for rcu_nocb_kthread() to wait GP. */ | 248 | /* Place for rcu_nocb_kthread() to wait GP. */ |
| 248 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 249 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 249 | int need_future_gp[2]; | 250 | int need_future_gp[2]; |
| @@ -399,7 +400,7 @@ struct rcu_data { | |||
| 399 | atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ | 400 | atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ |
| 400 | struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ | 401 | struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ |
| 401 | struct rcu_head **nocb_follower_tail; | 402 | struct rcu_head **nocb_follower_tail; |
| 402 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | 403 | struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ |
| 403 | struct task_struct *nocb_kthread; | 404 | struct task_struct *nocb_kthread; |
| 404 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ | 405 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ |
| 405 | 406 | ||
| @@ -478,7 +479,7 @@ struct rcu_state { | |||
| 478 | unsigned long gpnum; /* Current gp number. */ | 479 | unsigned long gpnum; /* Current gp number. */ |
| 479 | unsigned long completed; /* # of last completed gp. */ | 480 | unsigned long completed; /* # of last completed gp. */ |
| 480 | struct task_struct *gp_kthread; /* Task for grace periods. */ | 481 | struct task_struct *gp_kthread; /* Task for grace periods. */ |
| 481 | wait_queue_head_t gp_wq; /* Where GP task waits. */ | 482 | struct swait_queue_head gp_wq; /* Where GP task waits. */ |
| 482 | short gp_flags; /* Commands for GP task. */ | 483 | short gp_flags; /* Commands for GP task. */ |
| 483 | short gp_state; /* GP kthread sleep state. */ | 484 | short gp_state; /* GP kthread sleep state. */ |
| 484 | 485 | ||
| @@ -506,7 +507,7 @@ struct rcu_state { | |||
| 506 | unsigned long expedited_sequence; /* Take a ticket. */ | 507 | unsigned long expedited_sequence; /* Take a ticket. */ |
| 507 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ | 508 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ |
| 508 | atomic_t expedited_need_qs; /* # CPUs left to check in. */ | 509 | atomic_t expedited_need_qs; /* # CPUs left to check in. */ |
| 509 | wait_queue_head_t expedited_wq; /* Wait for check-ins. */ | 510 | struct swait_queue_head expedited_wq; /* Wait for check-ins. */ |
| 510 | int ncpus_snap; /* # CPUs seen last time. */ | 511 | int ncpus_snap; /* # CPUs seen last time. */ |
| 511 | 512 | ||
| 512 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 513 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
| @@ -621,7 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); | |||
| 621 | static void increment_cpu_stall_ticks(void); | 622 | static void increment_cpu_stall_ticks(void); |
| 622 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); | 623 | static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); |
| 623 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | 624 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); |
| 624 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | 625 | static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); |
| 626 | static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); | ||
| 625 | static void rcu_init_one_nocb(struct rcu_node *rnp); | 627 | static void rcu_init_one_nocb(struct rcu_node *rnp); |
| 626 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 628 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
| 627 | bool lazy, unsigned long flags); | 629 | bool lazy, unsigned long flags); |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9467a8b7e756..080bd202d360 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | |||
| 1811 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended | 1811 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended |
| 1812 | * grace period. | 1812 | * grace period. |
| 1813 | */ | 1813 | */ |
| 1814 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 1814 | static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) |
| 1815 | { | 1815 | { |
| 1816 | wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); | 1816 | swake_up_all(sq); |
| 1817 | } | 1817 | } |
| 1818 | 1818 | ||
| 1819 | /* | 1819 | /* |
| @@ -1829,10 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) | |||
| 1829 | rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; | 1829 | rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; |
| 1830 | } | 1830 | } |
| 1831 | 1831 | ||
| 1832 | static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) | ||
| 1833 | { | ||
| 1834 | return &rnp->nocb_gp_wq[rnp->completed & 0x1]; | ||
| 1835 | } | ||
| 1836 | |||
| 1832 | static void rcu_init_one_nocb(struct rcu_node *rnp) | 1837 | static void rcu_init_one_nocb(struct rcu_node *rnp) |
| 1833 | { | 1838 | { |
| 1834 | init_waitqueue_head(&rnp->nocb_gp_wq[0]); | 1839 | init_swait_queue_head(&rnp->nocb_gp_wq[0]); |
| 1835 | init_waitqueue_head(&rnp->nocb_gp_wq[1]); | 1840 | init_swait_queue_head(&rnp->nocb_gp_wq[1]); |
| 1836 | } | 1841 | } |
| 1837 | 1842 | ||
| 1838 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1843 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| @@ -1857,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) | |||
| 1857 | if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { | 1862 | if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { |
| 1858 | /* Prior smp_mb__after_atomic() orders against prior enqueue. */ | 1863 | /* Prior smp_mb__after_atomic() orders against prior enqueue. */ |
| 1859 | WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); | 1864 | WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); |
| 1860 | wake_up(&rdp_leader->nocb_wq); | 1865 | swake_up(&rdp_leader->nocb_wq); |
| 1861 | } | 1866 | } |
| 1862 | } | 1867 | } |
| 1863 | 1868 | ||
| @@ -2069,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
| 2069 | */ | 2074 | */ |
| 2070 | trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); | 2075 | trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); |
| 2071 | for (;;) { | 2076 | for (;;) { |
| 2072 | wait_event_interruptible( | 2077 | swait_event_interruptible( |
| 2073 | rnp->nocb_gp_wq[c & 0x1], | 2078 | rnp->nocb_gp_wq[c & 0x1], |
| 2074 | (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); | 2079 | (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); |
| 2075 | if (likely(d)) | 2080 | if (likely(d)) |
| @@ -2097,7 +2102,7 @@ wait_again: | |||
| 2097 | /* Wait for callbacks to appear. */ | 2102 | /* Wait for callbacks to appear. */ |
| 2098 | if (!rcu_nocb_poll) { | 2103 | if (!rcu_nocb_poll) { |
| 2099 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); | 2104 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); |
| 2100 | wait_event_interruptible(my_rdp->nocb_wq, | 2105 | swait_event_interruptible(my_rdp->nocb_wq, |
| 2101 | !READ_ONCE(my_rdp->nocb_leader_sleep)); | 2106 | !READ_ONCE(my_rdp->nocb_leader_sleep)); |
| 2102 | /* Memory barrier handled by smp_mb() calls below and repoll. */ | 2107 | /* Memory barrier handled by smp_mb() calls below and repoll. */ |
| 2103 | } else if (firsttime) { | 2108 | } else if (firsttime) { |
| @@ -2172,7 +2177,7 @@ wait_again: | |||
| 2172 | * List was empty, wake up the follower. | 2177 | * List was empty, wake up the follower. |
| 2173 | * Memory barriers supplied by atomic_long_add(). | 2178 | * Memory barriers supplied by atomic_long_add(). |
| 2174 | */ | 2179 | */ |
| 2175 | wake_up(&rdp->nocb_wq); | 2180 | swake_up(&rdp->nocb_wq); |
| 2176 | } | 2181 | } |
| 2177 | } | 2182 | } |
| 2178 | 2183 | ||
| @@ -2193,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) | |||
| 2193 | if (!rcu_nocb_poll) { | 2198 | if (!rcu_nocb_poll) { |
| 2194 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2199 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| 2195 | "FollowerSleep"); | 2200 | "FollowerSleep"); |
| 2196 | wait_event_interruptible(rdp->nocb_wq, | 2201 | swait_event_interruptible(rdp->nocb_wq, |
| 2197 | READ_ONCE(rdp->nocb_follower_head)); | 2202 | READ_ONCE(rdp->nocb_follower_head)); |
| 2198 | } else if (firsttime) { | 2203 | } else if (firsttime) { |
| 2199 | /* Don't drown trace log with "Poll"! */ | 2204 | /* Don't drown trace log with "Poll"! */ |
| @@ -2352,7 +2357,7 @@ void __init rcu_init_nohz(void) | |||
| 2352 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | 2357 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) |
| 2353 | { | 2358 | { |
| 2354 | rdp->nocb_tail = &rdp->nocb_head; | 2359 | rdp->nocb_tail = &rdp->nocb_head; |
| 2355 | init_waitqueue_head(&rdp->nocb_wq); | 2360 | init_swait_queue_head(&rdp->nocb_wq); |
| 2356 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; | 2361 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; |
| 2357 | } | 2362 | } |
| 2358 | 2363 | ||
| @@ -2502,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | |||
| 2502 | return false; | 2507 | return false; |
| 2503 | } | 2508 | } |
| 2504 | 2509 | ||
| 2505 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 2510 | static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) |
| 2506 | { | 2511 | { |
| 2507 | } | 2512 | } |
| 2508 | 2513 | ||
| @@ -2510,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) | |||
| 2510 | { | 2515 | { |
| 2511 | } | 2516 | } |
| 2512 | 2517 | ||
| 2518 | static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) | ||
| 2519 | { | ||
| 2520 | return NULL; | ||
| 2521 | } | ||
| 2522 | |||
| 2513 | static void rcu_init_one_nocb(struct rcu_node *rnp) | 2523 | static void rcu_init_one_nocb(struct rcu_node *rnp) |
| 2514 | { | 2524 | { |
| 2515 | } | 2525 | } |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..7d4cba227cbd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
| @@ -13,7 +13,7 @@ endif | |||
| 13 | 13 | ||
| 14 | obj-y += core.o loadavg.o clock.o cputime.o | 14 | obj-y += core.o loadavg.o clock.o cputime.o |
| 15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | 15 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o |
| 16 | obj-y += wait.o completion.o idle.o | 16 | obj-y += wait.o swait.o completion.o idle.o |
| 17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o | 17 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o |
| 18 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 18 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
| 19 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 19 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 41f6b2215aa8..05114b15b6d1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -67,12 +67,10 @@ | |||
| 67 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
| 68 | #include <linux/hrtimer.h> | 68 | #include <linux/hrtimer.h> |
| 69 | #include <linux/tick.h> | 69 | #include <linux/tick.h> |
| 70 | #include <linux/debugfs.h> | ||
| 71 | #include <linux/ctype.h> | 70 | #include <linux/ctype.h> |
| 72 | #include <linux/ftrace.h> | 71 | #include <linux/ftrace.h> |
| 73 | #include <linux/slab.h> | 72 | #include <linux/slab.h> |
| 74 | #include <linux/init_task.h> | 73 | #include <linux/init_task.h> |
| 75 | #include <linux/binfmts.h> | ||
| 76 | #include <linux/context_tracking.h> | 74 | #include <linux/context_tracking.h> |
| 77 | #include <linux/compiler.h> | 75 | #include <linux/compiler.h> |
| 78 | 76 | ||
| @@ -125,138 +123,6 @@ const_debug unsigned int sysctl_sched_features = | |||
| 125 | 123 | ||
| 126 | #undef SCHED_FEAT | 124 | #undef SCHED_FEAT |
| 127 | 125 | ||
| 128 | #ifdef CONFIG_SCHED_DEBUG | ||
| 129 | #define SCHED_FEAT(name, enabled) \ | ||
| 130 | #name , | ||
| 131 | |||
| 132 | static const char * const sched_feat_names[] = { | ||
| 133 | #include "features.h" | ||
| 134 | }; | ||
| 135 | |||
| 136 | #undef SCHED_FEAT | ||
| 137 | |||
| 138 | static int sched_feat_show(struct seq_file *m, void *v) | ||
| 139 | { | ||
| 140 | int i; | ||
| 141 | |||
| 142 | for (i = 0; i < __SCHED_FEAT_NR; i++) { | ||
| 143 | if (!(sysctl_sched_features & (1UL << i))) | ||
| 144 | seq_puts(m, "NO_"); | ||
| 145 | seq_printf(m, "%s ", sched_feat_names[i]); | ||
| 146 | } | ||
| 147 | seq_puts(m, "\n"); | ||
| 148 | |||
| 149 | return 0; | ||
| 150 | } | ||
| 151 | |||
| 152 | #ifdef HAVE_JUMP_LABEL | ||
| 153 | |||
| 154 | #define jump_label_key__true STATIC_KEY_INIT_TRUE | ||
| 155 | #define jump_label_key__false STATIC_KEY_INIT_FALSE | ||
| 156 | |||
| 157 | #define SCHED_FEAT(name, enabled) \ | ||
| 158 | jump_label_key__##enabled , | ||
| 159 | |||
| 160 | struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { | ||
| 161 | #include "features.h" | ||
| 162 | }; | ||
| 163 | |||
| 164 | #undef SCHED_FEAT | ||
| 165 | |||
| 166 | static void sched_feat_disable(int i) | ||
| 167 | { | ||
| 168 | static_key_disable(&sched_feat_keys[i]); | ||
| 169 | } | ||
| 170 | |||
| 171 | static void sched_feat_enable(int i) | ||
| 172 | { | ||
| 173 | static_key_enable(&sched_feat_keys[i]); | ||
| 174 | } | ||
| 175 | #else | ||
| 176 | static void sched_feat_disable(int i) { }; | ||
| 177 | static void sched_feat_enable(int i) { }; | ||
| 178 | #endif /* HAVE_JUMP_LABEL */ | ||
| 179 | |||
| 180 | static int sched_feat_set(char *cmp) | ||
| 181 | { | ||
| 182 | int i; | ||
| 183 | int neg = 0; | ||
| 184 | |||
| 185 | if (strncmp(cmp, "NO_", 3) == 0) { | ||
| 186 | neg = 1; | ||
| 187 | cmp += 3; | ||
| 188 | } | ||
| 189 | |||
| 190 | for (i = 0; i < __SCHED_FEAT_NR; i++) { | ||
| 191 | if (strcmp(cmp, sched_feat_names[i]) == 0) { | ||
| 192 | if (neg) { | ||
| 193 | sysctl_sched_features &= ~(1UL << i); | ||
| 194 | sched_feat_disable(i); | ||
| 195 | } else { | ||
| 196 | sysctl_sched_features |= (1UL << i); | ||
| 197 | sched_feat_enable(i); | ||
| 198 | } | ||
| 199 | break; | ||
| 200 | } | ||
| 201 | } | ||
| 202 | |||
| 203 | return i; | ||
| 204 | } | ||
| 205 | |||
| 206 | static ssize_t | ||
| 207 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
| 208 | size_t cnt, loff_t *ppos) | ||
| 209 | { | ||
| 210 | char buf[64]; | ||
| 211 | char *cmp; | ||
| 212 | int i; | ||
| 213 | struct inode *inode; | ||
| 214 | |||
| 215 | if (cnt > 63) | ||
| 216 | cnt = 63; | ||
| 217 | |||
| 218 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 219 | return -EFAULT; | ||
| 220 | |||
| 221 | buf[cnt] = 0; | ||
| 222 | cmp = strstrip(buf); | ||
| 223 | |||
| 224 | /* Ensure the static_key remains in a consistent state */ | ||
| 225 | inode = file_inode(filp); | ||
| 226 | inode_lock(inode); | ||
| 227 | i = sched_feat_set(cmp); | ||
| 228 | inode_unlock(inode); | ||
| 229 | if (i == __SCHED_FEAT_NR) | ||
| 230 | return -EINVAL; | ||
| 231 | |||
| 232 | *ppos += cnt; | ||
| 233 | |||
| 234 | return cnt; | ||
| 235 | } | ||
| 236 | |||
| 237 | static int sched_feat_open(struct inode *inode, struct file *filp) | ||
| 238 | { | ||
| 239 | return single_open(filp, sched_feat_show, NULL); | ||
| 240 | } | ||
| 241 | |||
| 242 | static const struct file_operations sched_feat_fops = { | ||
| 243 | .open = sched_feat_open, | ||
| 244 | .write = sched_feat_write, | ||
| 245 | .read = seq_read, | ||
| 246 | .llseek = seq_lseek, | ||
| 247 | .release = single_release, | ||
| 248 | }; | ||
| 249 | |||
| 250 | static __init int sched_init_debug(void) | ||
| 251 | { | ||
| 252 | debugfs_create_file("sched_features", 0644, NULL, NULL, | ||
| 253 | &sched_feat_fops); | ||
| 254 | |||
| 255 | return 0; | ||
| 256 | } | ||
| 257 | late_initcall(sched_init_debug); | ||
| 258 | #endif /* CONFIG_SCHED_DEBUG */ | ||
| 259 | |||
| 260 | /* | 126 | /* |
| 261 | * Number of tasks to iterate in a single balance run. | 127 | * Number of tasks to iterate in a single balance run. |
| 262 | * Limited because this is done with IRQs disabled. | 128 | * Limited because this is done with IRQs disabled. |
| @@ -2094,7 +1960,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 2094 | 1960 | ||
| 2095 | ttwu_queue(p, cpu); | 1961 | ttwu_queue(p, cpu); |
| 2096 | stat: | 1962 | stat: |
| 2097 | ttwu_stat(p, cpu, wake_flags); | 1963 | if (schedstat_enabled()) |
| 1964 | ttwu_stat(p, cpu, wake_flags); | ||
| 2098 | out: | 1965 | out: |
| 2099 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 1966 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 2100 | 1967 | ||
| @@ -2142,7 +2009,8 @@ static void try_to_wake_up_local(struct task_struct *p) | |||
| 2142 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2009 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
| 2143 | 2010 | ||
| 2144 | ttwu_do_wakeup(rq, p, 0); | 2011 | ttwu_do_wakeup(rq, p, 0); |
| 2145 | ttwu_stat(p, smp_processor_id(), 0); | 2012 | if (schedstat_enabled()) |
| 2013 | ttwu_stat(p, smp_processor_id(), 0); | ||
| 2146 | out: | 2014 | out: |
| 2147 | raw_spin_unlock(&p->pi_lock); | 2015 | raw_spin_unlock(&p->pi_lock); |
| 2148 | } | 2016 | } |
| @@ -2184,7 +2052,6 @@ void __dl_clear_params(struct task_struct *p) | |||
| 2184 | dl_se->dl_bw = 0; | 2052 | dl_se->dl_bw = 0; |
| 2185 | 2053 | ||
| 2186 | dl_se->dl_throttled = 0; | 2054 | dl_se->dl_throttled = 0; |
| 2187 | dl_se->dl_new = 1; | ||
| 2188 | dl_se->dl_yielded = 0; | 2055 | dl_se->dl_yielded = 0; |
| 2189 | } | 2056 | } |
| 2190 | 2057 | ||
| @@ -2211,6 +2078,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 2211 | #endif | 2078 | #endif |
| 2212 | 2079 | ||
| 2213 | #ifdef CONFIG_SCHEDSTATS | 2080 | #ifdef CONFIG_SCHEDSTATS |
| 2081 | /* Even if schedstat is disabled, there should not be garbage */ | ||
| 2214 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2082 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
| 2215 | #endif | 2083 | #endif |
| 2216 | 2084 | ||
| @@ -2219,6 +2087,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 2219 | __dl_clear_params(p); | 2087 | __dl_clear_params(p); |
| 2220 | 2088 | ||
| 2221 | INIT_LIST_HEAD(&p->rt.run_list); | 2089 | INIT_LIST_HEAD(&p->rt.run_list); |
| 2090 | p->rt.timeout = 0; | ||
| 2091 | p->rt.time_slice = sched_rr_timeslice; | ||
| 2092 | p->rt.on_rq = 0; | ||
| 2093 | p->rt.on_list = 0; | ||
| 2222 | 2094 | ||
| 2223 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2095 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
| 2224 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2096 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
| @@ -2282,6 +2154,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, | |||
| 2282 | #endif | 2154 | #endif |
| 2283 | #endif | 2155 | #endif |
| 2284 | 2156 | ||
| 2157 | DEFINE_STATIC_KEY_FALSE(sched_schedstats); | ||
| 2158 | |||
| 2159 | #ifdef CONFIG_SCHEDSTATS | ||
| 2160 | static void set_schedstats(bool enabled) | ||
| 2161 | { | ||
| 2162 | if (enabled) | ||
| 2163 | static_branch_enable(&sched_schedstats); | ||
| 2164 | else | ||
| 2165 | static_branch_disable(&sched_schedstats); | ||
| 2166 | } | ||
| 2167 | |||
| 2168 | void force_schedstat_enabled(void) | ||
| 2169 | { | ||
| 2170 | if (!schedstat_enabled()) { | ||
| 2171 | pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); | ||
| 2172 | static_branch_enable(&sched_schedstats); | ||
| 2173 | } | ||
| 2174 | } | ||
| 2175 | |||
| 2176 | static int __init setup_schedstats(char *str) | ||
| 2177 | { | ||
| 2178 | int ret = 0; | ||
| 2179 | if (!str) | ||
| 2180 | goto out; | ||
| 2181 | |||
| 2182 | if (!strcmp(str, "enable")) { | ||
| 2183 | set_schedstats(true); | ||
| 2184 | ret = 1; | ||
| 2185 | } else if (!strcmp(str, "disable")) { | ||
| 2186 | set_schedstats(false); | ||
| 2187 | ret = 1; | ||
| 2188 | } | ||
| 2189 | out: | ||
| 2190 | if (!ret) | ||
| 2191 | pr_warn("Unable to parse schedstats=\n"); | ||
| 2192 | |||
| 2193 | return ret; | ||
| 2194 | } | ||
| 2195 | __setup("schedstats=", setup_schedstats); | ||
| 2196 | |||
| 2197 | #ifdef CONFIG_PROC_SYSCTL | ||
| 2198 | int sysctl_schedstats(struct ctl_table *table, int write, | ||
| 2199 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2200 | { | ||
| 2201 | struct ctl_table t; | ||
| 2202 | int err; | ||
| 2203 | int state = static_branch_likely(&sched_schedstats); | ||
| 2204 | |||
| 2205 | if (write && !capable(CAP_SYS_ADMIN)) | ||
| 2206 | return -EPERM; | ||
| 2207 | |||
| 2208 | t = *table; | ||
| 2209 | t.data = &state; | ||
| 2210 | err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); | ||
| 2211 | if (err < 0) | ||
| 2212 | return err; | ||
| 2213 | if (write) | ||
| 2214 | set_schedstats(state); | ||
| 2215 | return err; | ||
| 2216 | } | ||
| 2217 | #endif | ||
| 2218 | #endif | ||
| 2219 | |||
| 2285 | /* | 2220 | /* |
| 2286 | * fork()/clone()-time setup: | 2221 | * fork()/clone()-time setup: |
| 2287 | */ | 2222 | */ |
| @@ -3011,16 +2946,6 @@ u64 scheduler_tick_max_deferment(void) | |||
| 3011 | } | 2946 | } |
| 3012 | #endif | 2947 | #endif |
| 3013 | 2948 | ||
| 3014 | notrace unsigned long get_parent_ip(unsigned long addr) | ||
| 3015 | { | ||
| 3016 | if (in_lock_functions(addr)) { | ||
| 3017 | addr = CALLER_ADDR2; | ||
| 3018 | if (in_lock_functions(addr)) | ||
| 3019 | addr = CALLER_ADDR3; | ||
| 3020 | } | ||
| 3021 | return addr; | ||
| 3022 | } | ||
| 3023 | |||
| 3024 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 2949 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
| 3025 | defined(CONFIG_PREEMPT_TRACER)) | 2950 | defined(CONFIG_PREEMPT_TRACER)) |
| 3026 | 2951 | ||
| @@ -3042,7 +2967,7 @@ void preempt_count_add(int val) | |||
| 3042 | PREEMPT_MASK - 10); | 2967 | PREEMPT_MASK - 10); |
| 3043 | #endif | 2968 | #endif |
| 3044 | if (preempt_count() == val) { | 2969 | if (preempt_count() == val) { |
| 3045 | unsigned long ip = get_parent_ip(CALLER_ADDR1); | 2970 | unsigned long ip = get_lock_parent_ip(); |
| 3046 | #ifdef CONFIG_DEBUG_PREEMPT | 2971 | #ifdef CONFIG_DEBUG_PREEMPT |
| 3047 | current->preempt_disable_ip = ip; | 2972 | current->preempt_disable_ip = ip; |
| 3048 | #endif | 2973 | #endif |
| @@ -3069,7 +2994,7 @@ void preempt_count_sub(int val) | |||
| 3069 | #endif | 2994 | #endif |
| 3070 | 2995 | ||
| 3071 | if (preempt_count() == val) | 2996 | if (preempt_count() == val) |
| 3072 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 2997 | trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); |
| 3073 | __preempt_count_sub(val); | 2998 | __preempt_count_sub(val); |
| 3074 | } | 2999 | } |
| 3075 | EXPORT_SYMBOL(preempt_count_sub); | 3000 | EXPORT_SYMBOL(preempt_count_sub); |
| @@ -3281,7 +3206,6 @@ static void __sched notrace __schedule(bool preempt) | |||
| 3281 | 3206 | ||
| 3282 | trace_sched_switch(preempt, prev, next); | 3207 | trace_sched_switch(preempt, prev, next); |
| 3283 | rq = context_switch(rq, prev, next); /* unlocks the rq */ | 3208 | rq = context_switch(rq, prev, next); /* unlocks the rq */ |
| 3284 | cpu = cpu_of(rq); | ||
| 3285 | } else { | 3209 | } else { |
| 3286 | lockdep_unpin_lock(&rq->lock); | 3210 | lockdep_unpin_lock(&rq->lock); |
| 3287 | raw_spin_unlock_irq(&rq->lock); | 3211 | raw_spin_unlock_irq(&rq->lock); |
| @@ -3467,7 +3391,7 @@ EXPORT_SYMBOL(default_wake_function); | |||
| 3467 | */ | 3391 | */ |
| 3468 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3392 | void rt_mutex_setprio(struct task_struct *p, int prio) |
| 3469 | { | 3393 | { |
| 3470 | int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; | 3394 | int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; |
| 3471 | struct rq *rq; | 3395 | struct rq *rq; |
| 3472 | const struct sched_class *prev_class; | 3396 | const struct sched_class *prev_class; |
| 3473 | 3397 | ||
| @@ -3495,11 +3419,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3495 | 3419 | ||
| 3496 | trace_sched_pi_setprio(p, prio); | 3420 | trace_sched_pi_setprio(p, prio); |
| 3497 | oldprio = p->prio; | 3421 | oldprio = p->prio; |
| 3422 | |||
| 3423 | if (oldprio == prio) | ||
| 3424 | queue_flag &= ~DEQUEUE_MOVE; | ||
| 3425 | |||
| 3498 | prev_class = p->sched_class; | 3426 | prev_class = p->sched_class; |
| 3499 | queued = task_on_rq_queued(p); | 3427 | queued = task_on_rq_queued(p); |
| 3500 | running = task_current(rq, p); | 3428 | running = task_current(rq, p); |
| 3501 | if (queued) | 3429 | if (queued) |
| 3502 | dequeue_task(rq, p, DEQUEUE_SAVE); | 3430 | dequeue_task(rq, p, queue_flag); |
| 3503 | if (running) | 3431 | if (running) |
| 3504 | put_prev_task(rq, p); | 3432 | put_prev_task(rq, p); |
| 3505 | 3433 | ||
| @@ -3517,7 +3445,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3517 | if (!dl_prio(p->normal_prio) || | 3445 | if (!dl_prio(p->normal_prio) || |
| 3518 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { | 3446 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { |
| 3519 | p->dl.dl_boosted = 1; | 3447 | p->dl.dl_boosted = 1; |
| 3520 | enqueue_flag |= ENQUEUE_REPLENISH; | 3448 | queue_flag |= ENQUEUE_REPLENISH; |
| 3521 | } else | 3449 | } else |
| 3522 | p->dl.dl_boosted = 0; | 3450 | p->dl.dl_boosted = 0; |
| 3523 | p->sched_class = &dl_sched_class; | 3451 | p->sched_class = &dl_sched_class; |
| @@ -3525,7 +3453,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3525 | if (dl_prio(oldprio)) | 3453 | if (dl_prio(oldprio)) |
| 3526 | p->dl.dl_boosted = 0; | 3454 | p->dl.dl_boosted = 0; |
| 3527 | if (oldprio < prio) | 3455 | if (oldprio < prio) |
| 3528 | enqueue_flag |= ENQUEUE_HEAD; | 3456 | queue_flag |= ENQUEUE_HEAD; |
| 3529 | p->sched_class = &rt_sched_class; | 3457 | p->sched_class = &rt_sched_class; |
| 3530 | } else { | 3458 | } else { |
| 3531 | if (dl_prio(oldprio)) | 3459 | if (dl_prio(oldprio)) |
| @@ -3540,7 +3468,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3540 | if (running) | 3468 | if (running) |
| 3541 | p->sched_class->set_curr_task(rq); | 3469 | p->sched_class->set_curr_task(rq); |
| 3542 | if (queued) | 3470 | if (queued) |
| 3543 | enqueue_task(rq, p, enqueue_flag); | 3471 | enqueue_task(rq, p, queue_flag); |
| 3544 | 3472 | ||
| 3545 | check_class_changed(rq, p, prev_class, oldprio); | 3473 | check_class_changed(rq, p, prev_class, oldprio); |
| 3546 | out_unlock: | 3474 | out_unlock: |
| @@ -3896,6 +3824,7 @@ static int __sched_setscheduler(struct task_struct *p, | |||
| 3896 | const struct sched_class *prev_class; | 3824 | const struct sched_class *prev_class; |
| 3897 | struct rq *rq; | 3825 | struct rq *rq; |
| 3898 | int reset_on_fork; | 3826 | int reset_on_fork; |
| 3827 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; | ||
| 3899 | 3828 | ||
| 3900 | /* may grab non-irq protected spin_locks */ | 3829 | /* may grab non-irq protected spin_locks */ |
| 3901 | BUG_ON(in_interrupt()); | 3830 | BUG_ON(in_interrupt()); |
| @@ -4078,17 +4007,14 @@ change: | |||
| 4078 | * itself. | 4007 | * itself. |
| 4079 | */ | 4008 | */ |
| 4080 | new_effective_prio = rt_mutex_get_effective_prio(p, newprio); | 4009 | new_effective_prio = rt_mutex_get_effective_prio(p, newprio); |
| 4081 | if (new_effective_prio == oldprio) { | 4010 | if (new_effective_prio == oldprio) |
| 4082 | __setscheduler_params(p, attr); | 4011 | queue_flags &= ~DEQUEUE_MOVE; |
| 4083 | task_rq_unlock(rq, p, &flags); | ||
| 4084 | return 0; | ||
| 4085 | } | ||
| 4086 | } | 4012 | } |
| 4087 | 4013 | ||
| 4088 | queued = task_on_rq_queued(p); | 4014 | queued = task_on_rq_queued(p); |
| 4089 | running = task_current(rq, p); | 4015 | running = task_current(rq, p); |
| 4090 | if (queued) | 4016 | if (queued) |
| 4091 | dequeue_task(rq, p, DEQUEUE_SAVE); | 4017 | dequeue_task(rq, p, queue_flags); |
| 4092 | if (running) | 4018 | if (running) |
| 4093 | put_prev_task(rq, p); | 4019 | put_prev_task(rq, p); |
| 4094 | 4020 | ||
| @@ -4098,15 +4024,14 @@ change: | |||
| 4098 | if (running) | 4024 | if (running) |
| 4099 | p->sched_class->set_curr_task(rq); | 4025 | p->sched_class->set_curr_task(rq); |
| 4100 | if (queued) { | 4026 | if (queued) { |
| 4101 | int enqueue_flags = ENQUEUE_RESTORE; | ||
| 4102 | /* | 4027 | /* |
| 4103 | * We enqueue to tail when the priority of a task is | 4028 | * We enqueue to tail when the priority of a task is |
| 4104 | * increased (user space view). | 4029 | * increased (user space view). |
| 4105 | */ | 4030 | */ |
| 4106 | if (oldprio <= p->prio) | 4031 | if (oldprio < p->prio) |
| 4107 | enqueue_flags |= ENQUEUE_HEAD; | 4032 | queue_flags |= ENQUEUE_HEAD; |
| 4108 | 4033 | ||
| 4109 | enqueue_task(rq, p, enqueue_flags); | 4034 | enqueue_task(rq, p, queue_flags); |
| 4110 | } | 4035 | } |
| 4111 | 4036 | ||
| 4112 | check_class_changed(rq, p, prev_class, oldprio); | 4037 | check_class_changed(rq, p, prev_class, oldprio); |
| @@ -5408,183 +5333,6 @@ static void migrate_tasks(struct rq *dead_rq) | |||
| 5408 | } | 5333 | } |
| 5409 | #endif /* CONFIG_HOTPLUG_CPU */ | 5334 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 5410 | 5335 | ||
| 5411 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | ||
| 5412 | |||
| 5413 | static struct ctl_table sd_ctl_dir[] = { | ||
| 5414 | { | ||
| 5415 | .procname = "sched_domain", | ||
| 5416 | .mode = 0555, | ||
| 5417 | }, | ||
| 5418 | {} | ||
| 5419 | }; | ||
| 5420 | |||
| 5421 | static struct ctl_table sd_ctl_root[] = { | ||
| 5422 | { | ||
| 5423 | .procname = "kernel", | ||
| 5424 | .mode = 0555, | ||
| 5425 | .child = sd_ctl_dir, | ||
| 5426 | }, | ||
| 5427 | {} | ||
| 5428 | }; | ||
| 5429 | |||
| 5430 | static struct ctl_table *sd_alloc_ctl_entry(int n) | ||
| 5431 | { | ||
| 5432 | struct ctl_table *entry = | ||
| 5433 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); | ||
| 5434 | |||
| 5435 | return entry; | ||
| 5436 | } | ||
| 5437 | |||
| 5438 | static void sd_free_ctl_entry(struct ctl_table **tablep) | ||
| 5439 | { | ||
| 5440 | struct ctl_table *entry; | ||
| 5441 | |||
| 5442 | /* | ||
| 5443 | * In the intermediate directories, both the child directory and | ||
| 5444 | * procname are dynamically allocated and could fail but the mode | ||
| 5445 | * will always be set. In the lowest directory the names are | ||
| 5446 | * static strings and all have proc handlers. | ||
| 5447 | */ | ||
| 5448 | for (entry = *tablep; entry->mode; entry++) { | ||
| 5449 | if (entry->child) | ||
| 5450 | sd_free_ctl_entry(&entry->child); | ||
| 5451 | if (entry->proc_handler == NULL) | ||
| 5452 | kfree(entry->procname); | ||
| 5453 | } | ||
| 5454 | |||
| 5455 | kfree(*tablep); | ||
| 5456 | *tablep = NULL; | ||
| 5457 | } | ||
| 5458 | |||
| 5459 | static int min_load_idx = 0; | ||
| 5460 | static int max_load_idx = CPU_LOAD_IDX_MAX-1; | ||
| 5461 | |||
| 5462 | static void | ||
| 5463 | set_table_entry(struct ctl_table *entry, | ||
| 5464 | const char *procname, void *data, int maxlen, | ||
| 5465 | umode_t mode, proc_handler *proc_handler, | ||
| 5466 | bool load_idx) | ||
| 5467 | { | ||
| 5468 | entry->procname = procname; | ||
| 5469 | entry->data = data; | ||
| 5470 | entry->maxlen = maxlen; | ||
| 5471 | entry->mode = mode; | ||
| 5472 | entry->proc_handler = proc_handler; | ||
| 5473 | |||
| 5474 | if (load_idx) { | ||
| 5475 | entry->extra1 = &min_load_idx; | ||
| 5476 | entry->extra2 = &max_load_idx; | ||
| 5477 | } | ||
| 5478 | } | ||
| 5479 | |||
| 5480 | static struct ctl_table * | ||
| 5481 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | ||
| 5482 | { | ||
| 5483 | struct ctl_table *table = sd_alloc_ctl_entry(14); | ||
| 5484 | |||
| 5485 | if (table == NULL) | ||
| 5486 | return NULL; | ||
| 5487 | |||
| 5488 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | ||
| 5489 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
| 5490 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | ||
| 5491 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
| 5492 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | ||
| 5493 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 5494 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | ||
| 5495 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 5496 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | ||
| 5497 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 5498 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | ||
| 5499 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 5500 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | ||
| 5501 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 5502 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | ||
| 5503 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 5504 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | ||
| 5505 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 5506 | set_table_entry(&table[9], "cache_nice_tries", | ||
| 5507 | &sd->cache_nice_tries, | ||
| 5508 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 5509 | set_table_entry(&table[10], "flags", &sd->flags, | ||
| 5510 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 5511 | set_table_entry(&table[11], "max_newidle_lb_cost", | ||
| 5512 | &sd->max_newidle_lb_cost, | ||
| 5513 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
| 5514 | set_table_entry(&table[12], "name", sd->name, | ||
| 5515 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); | ||
| 5516 | /* &table[13] is terminator */ | ||
| 5517 | |||
| 5518 | return table; | ||
| 5519 | } | ||
| 5520 | |||
| 5521 | static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) | ||
| 5522 | { | ||
| 5523 | struct ctl_table *entry, *table; | ||
| 5524 | struct sched_domain *sd; | ||
| 5525 | int domain_num = 0, i; | ||
| 5526 | char buf[32]; | ||
| 5527 | |||
| 5528 | for_each_domain(cpu, sd) | ||
| 5529 | domain_num++; | ||
| 5530 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | ||
| 5531 | if (table == NULL) | ||
| 5532 | return NULL; | ||
| 5533 | |||
| 5534 | i = 0; | ||
| 5535 | for_each_domain(cpu, sd) { | ||
| 5536 | snprintf(buf, 32, "domain%d", i); | ||
| 5537 | entry->procname = kstrdup(buf, GFP_KERNEL); | ||
| 5538 | entry->mode = 0555; | ||
| 5539 | entry->child = sd_alloc_ctl_domain_table(sd); | ||
| 5540 | entry++; | ||
| 5541 | i++; | ||
| 5542 | } | ||
| 5543 | return table; | ||
| 5544 | } | ||
| 5545 | |||
| 5546 | static struct ctl_table_header *sd_sysctl_header; | ||
| 5547 | static void register_sched_domain_sysctl(void) | ||
| 5548 | { | ||
| 5549 | int i, cpu_num = num_possible_cpus(); | ||
| 5550 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | ||
| 5551 | char buf[32]; | ||
| 5552 | |||
| 5553 | WARN_ON(sd_ctl_dir[0].child); | ||
| 5554 | sd_ctl_dir[0].child = entry; | ||
| 5555 | |||
| 5556 | if (entry == NULL) | ||
| 5557 | return; | ||
| 5558 | |||
| 5559 | for_each_possible_cpu(i) { | ||
| 5560 | snprintf(buf, 32, "cpu%d", i); | ||
| 5561 | entry->procname = kstrdup(buf, GFP_KERNEL); | ||
| 5562 | entry->mode = 0555; | ||
| 5563 | entry->child = sd_alloc_ctl_cpu_table(i); | ||
| 5564 | entry++; | ||
| 5565 | } | ||
| 5566 | |||
| 5567 | WARN_ON(sd_sysctl_header); | ||
| 5568 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | ||
| 5569 | } | ||
| 5570 | |||
| 5571 | /* may be called multiple times per register */ | ||
| 5572 | static void unregister_sched_domain_sysctl(void) | ||
| 5573 | { | ||
| 5574 | unregister_sysctl_table(sd_sysctl_header); | ||
| 5575 | sd_sysctl_header = NULL; | ||
| 5576 | if (sd_ctl_dir[0].child) | ||
| 5577 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | ||
| 5578 | } | ||
| 5579 | #else | ||
| 5580 | static void register_sched_domain_sysctl(void) | ||
| 5581 | { | ||
| 5582 | } | ||
| 5583 | static void unregister_sched_domain_sysctl(void) | ||
| 5584 | { | ||
| 5585 | } | ||
| 5586 | #endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ | ||
| 5587 | |||
| 5588 | static void set_rq_online(struct rq *rq) | 5336 | static void set_rq_online(struct rq *rq) |
| 5589 | { | 5337 | { |
| 5590 | if (!rq->online) { | 5338 | if (!rq->online) { |
| @@ -6176,11 +5924,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 6176 | /* Setup the mask of cpus configured for isolated domains */ | 5924 | /* Setup the mask of cpus configured for isolated domains */ |
| 6177 | static int __init isolated_cpu_setup(char *str) | 5925 | static int __init isolated_cpu_setup(char *str) |
| 6178 | { | 5926 | { |
| 5927 | int ret; | ||
| 5928 | |||
| 6179 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | 5929 | alloc_bootmem_cpumask_var(&cpu_isolated_map); |
| 6180 | cpulist_parse(str, cpu_isolated_map); | 5930 | ret = cpulist_parse(str, cpu_isolated_map); |
| 5931 | if (ret) { | ||
| 5932 | pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); | ||
| 5933 | return 0; | ||
| 5934 | } | ||
| 6181 | return 1; | 5935 | return 1; |
| 6182 | } | 5936 | } |
| 6183 | |||
| 6184 | __setup("isolcpus=", isolated_cpu_setup); | 5937 | __setup("isolcpus=", isolated_cpu_setup); |
| 6185 | 5938 | ||
| 6186 | struct s_data { | 5939 | struct s_data { |
| @@ -7863,11 +7616,9 @@ void sched_destroy_group(struct task_group *tg) | |||
| 7863 | void sched_offline_group(struct task_group *tg) | 7616 | void sched_offline_group(struct task_group *tg) |
| 7864 | { | 7617 | { |
| 7865 | unsigned long flags; | 7618 | unsigned long flags; |
| 7866 | int i; | ||
| 7867 | 7619 | ||
| 7868 | /* end participation in shares distribution */ | 7620 | /* end participation in shares distribution */ |
| 7869 | for_each_possible_cpu(i) | 7621 | unregister_fair_sched_group(tg); |
| 7870 | unregister_fair_sched_group(tg, i); | ||
| 7871 | 7622 | ||
| 7872 | spin_lock_irqsave(&task_group_lock, flags); | 7623 | spin_lock_irqsave(&task_group_lock, flags); |
| 7873 | list_del_rcu(&tg->list); | 7624 | list_del_rcu(&tg->list); |
| @@ -7893,7 +7644,7 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7893 | queued = task_on_rq_queued(tsk); | 7644 | queued = task_on_rq_queued(tsk); |
| 7894 | 7645 | ||
| 7895 | if (queued) | 7646 | if (queued) |
| 7896 | dequeue_task(rq, tsk, DEQUEUE_SAVE); | 7647 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); |
| 7897 | if (unlikely(running)) | 7648 | if (unlikely(running)) |
| 7898 | put_prev_task(rq, tsk); | 7649 | put_prev_task(rq, tsk); |
| 7899 | 7650 | ||
| @@ -7917,7 +7668,7 @@ void sched_move_task(struct task_struct *tsk) | |||
| 7917 | if (unlikely(running)) | 7668 | if (unlikely(running)) |
| 7918 | tsk->sched_class->set_curr_task(rq); | 7669 | tsk->sched_class->set_curr_task(rq); |
| 7919 | if (queued) | 7670 | if (queued) |
| 7920 | enqueue_task(rq, tsk, ENQUEUE_RESTORE); | 7671 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); |
| 7921 | 7672 | ||
| 7922 | task_rq_unlock(rq, tsk, &flags); | 7673 | task_rq_unlock(rq, tsk, &flags); |
| 7923 | } | 7674 | } |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b2ab2ffb1adc..75f98c5498d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void) | |||
| 262 | #ifdef CONFIG_PARAVIRT | 262 | #ifdef CONFIG_PARAVIRT |
| 263 | if (static_key_false(¶virt_steal_enabled)) { | 263 | if (static_key_false(¶virt_steal_enabled)) { |
| 264 | u64 steal; | 264 | u64 steal; |
| 265 | cputime_t steal_ct; | 265 | unsigned long steal_jiffies; |
| 266 | 266 | ||
| 267 | steal = paravirt_steal_clock(smp_processor_id()); | 267 | steal = paravirt_steal_clock(smp_processor_id()); |
| 268 | steal -= this_rq()->prev_steal_time; | 268 | steal -= this_rq()->prev_steal_time; |
| 269 | 269 | ||
| 270 | /* | 270 | /* |
| 271 | * cputime_t may be less precise than nsecs (eg: if it's | 271 | * steal is in nsecs but our caller is expecting steal |
| 272 | * based on jiffies). Lets cast the result to cputime | 272 | * time in jiffies. Lets cast the result to jiffies |
| 273 | * granularity and account the rest on the next rounds. | 273 | * granularity and account the rest on the next rounds. |
| 274 | */ | 274 | */ |
| 275 | steal_ct = nsecs_to_cputime(steal); | 275 | steal_jiffies = nsecs_to_jiffies(steal); |
| 276 | this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); | 276 | this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); |
| 277 | 277 | ||
| 278 | account_steal_time(steal_ct); | 278 | account_steal_time(jiffies_to_cputime(steal_jiffies)); |
| 279 | return steal_ct; | 279 | return steal_jiffies; |
| 280 | } | 280 | } |
| 281 | #endif | 281 | #endif |
| 282 | return false; | 282 | return false; |
| @@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
| 668 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 668 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
| 669 | 669 | ||
| 670 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 670 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
| 671 | static unsigned long long vtime_delta(struct task_struct *tsk) | 671 | static cputime_t vtime_delta(struct task_struct *tsk) |
| 672 | { | 672 | { |
| 673 | unsigned long long clock; | 673 | unsigned long now = READ_ONCE(jiffies); |
| 674 | 674 | ||
| 675 | clock = local_clock(); | 675 | if (time_before(now, (unsigned long)tsk->vtime_snap)) |
| 676 | if (clock < tsk->vtime_snap) | ||
| 677 | return 0; | 676 | return 0; |
| 678 | 677 | ||
| 679 | return clock - tsk->vtime_snap; | 678 | return jiffies_to_cputime(now - tsk->vtime_snap); |
| 680 | } | 679 | } |
| 681 | 680 | ||
| 682 | static cputime_t get_vtime_delta(struct task_struct *tsk) | 681 | static cputime_t get_vtime_delta(struct task_struct *tsk) |
| 683 | { | 682 | { |
| 684 | unsigned long long delta = vtime_delta(tsk); | 683 | unsigned long now = READ_ONCE(jiffies); |
| 684 | unsigned long delta = now - tsk->vtime_snap; | ||
| 685 | 685 | ||
| 686 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 686 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); |
| 687 | tsk->vtime_snap += delta; | 687 | tsk->vtime_snap = now; |
| 688 | 688 | ||
| 689 | /* CHECKME: always safe to convert nsecs to cputime? */ | 689 | return jiffies_to_cputime(delta); |
| 690 | return nsecs_to_cputime(delta); | ||
| 691 | } | 690 | } |
| 692 | 691 | ||
| 693 | static void __vtime_account_system(struct task_struct *tsk) | 692 | static void __vtime_account_system(struct task_struct *tsk) |
| @@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk) | |||
| 699 | 698 | ||
| 700 | void vtime_account_system(struct task_struct *tsk) | 699 | void vtime_account_system(struct task_struct *tsk) |
| 701 | { | 700 | { |
| 701 | if (!vtime_delta(tsk)) | ||
| 702 | return; | ||
| 703 | |||
| 702 | write_seqcount_begin(&tsk->vtime_seqcount); | 704 | write_seqcount_begin(&tsk->vtime_seqcount); |
| 703 | __vtime_account_system(tsk); | 705 | __vtime_account_system(tsk); |
| 704 | write_seqcount_end(&tsk->vtime_seqcount); | 706 | write_seqcount_end(&tsk->vtime_seqcount); |
| @@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk) | |||
| 707 | void vtime_gen_account_irq_exit(struct task_struct *tsk) | 709 | void vtime_gen_account_irq_exit(struct task_struct *tsk) |
| 708 | { | 710 | { |
| 709 | write_seqcount_begin(&tsk->vtime_seqcount); | 711 | write_seqcount_begin(&tsk->vtime_seqcount); |
| 710 | __vtime_account_system(tsk); | 712 | if (vtime_delta(tsk)) |
| 713 | __vtime_account_system(tsk); | ||
| 711 | if (context_tracking_in_user()) | 714 | if (context_tracking_in_user()) |
| 712 | tsk->vtime_snap_whence = VTIME_USER; | 715 | tsk->vtime_snap_whence = VTIME_USER; |
| 713 | write_seqcount_end(&tsk->vtime_seqcount); | 716 | write_seqcount_end(&tsk->vtime_seqcount); |
| @@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk) | |||
| 718 | cputime_t delta_cpu; | 721 | cputime_t delta_cpu; |
| 719 | 722 | ||
| 720 | write_seqcount_begin(&tsk->vtime_seqcount); | 723 | write_seqcount_begin(&tsk->vtime_seqcount); |
| 721 | delta_cpu = get_vtime_delta(tsk); | ||
| 722 | tsk->vtime_snap_whence = VTIME_SYS; | 724 | tsk->vtime_snap_whence = VTIME_SYS; |
| 723 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | 725 | if (vtime_delta(tsk)) { |
| 726 | delta_cpu = get_vtime_delta(tsk); | ||
| 727 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | ||
| 728 | } | ||
| 724 | write_seqcount_end(&tsk->vtime_seqcount); | 729 | write_seqcount_end(&tsk->vtime_seqcount); |
| 725 | } | 730 | } |
| 726 | 731 | ||
| 727 | void vtime_user_enter(struct task_struct *tsk) | 732 | void vtime_user_enter(struct task_struct *tsk) |
| 728 | { | 733 | { |
| 729 | write_seqcount_begin(&tsk->vtime_seqcount); | 734 | write_seqcount_begin(&tsk->vtime_seqcount); |
| 730 | __vtime_account_system(tsk); | 735 | if (vtime_delta(tsk)) |
| 736 | __vtime_account_system(tsk); | ||
| 731 | tsk->vtime_snap_whence = VTIME_USER; | 737 | tsk->vtime_snap_whence = VTIME_USER; |
| 732 | write_seqcount_end(&tsk->vtime_seqcount); | 738 | write_seqcount_end(&tsk->vtime_seqcount); |
| 733 | } | 739 | } |
| @@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk) | |||
| 742 | * that can thus safely catch up with a tickless delta. | 748 | * that can thus safely catch up with a tickless delta. |
| 743 | */ | 749 | */ |
| 744 | write_seqcount_begin(&tsk->vtime_seqcount); | 750 | write_seqcount_begin(&tsk->vtime_seqcount); |
| 745 | __vtime_account_system(tsk); | 751 | if (vtime_delta(tsk)) |
| 752 | __vtime_account_system(tsk); | ||
| 746 | current->flags |= PF_VCPU; | 753 | current->flags |= PF_VCPU; |
| 747 | write_seqcount_end(&tsk->vtime_seqcount); | 754 | write_seqcount_end(&tsk->vtime_seqcount); |
| 748 | } | 755 | } |
| @@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev) | |||
| 772 | 779 | ||
| 773 | write_seqcount_begin(¤t->vtime_seqcount); | 780 | write_seqcount_begin(¤t->vtime_seqcount); |
| 774 | current->vtime_snap_whence = VTIME_SYS; | 781 | current->vtime_snap_whence = VTIME_SYS; |
| 775 | current->vtime_snap = sched_clock_cpu(smp_processor_id()); | 782 | current->vtime_snap = jiffies; |
| 776 | write_seqcount_end(¤t->vtime_seqcount); | 783 | write_seqcount_end(¤t->vtime_seqcount); |
| 777 | } | 784 | } |
| 778 | 785 | ||
| @@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) | |||
| 783 | local_irq_save(flags); | 790 | local_irq_save(flags); |
| 784 | write_seqcount_begin(&t->vtime_seqcount); | 791 | write_seqcount_begin(&t->vtime_seqcount); |
| 785 | t->vtime_snap_whence = VTIME_SYS; | 792 | t->vtime_snap_whence = VTIME_SYS; |
| 786 | t->vtime_snap = sched_clock_cpu(cpu); | 793 | t->vtime_snap = jiffies; |
| 787 | write_seqcount_end(&t->vtime_seqcount); | 794 | write_seqcount_end(&t->vtime_seqcount); |
| 788 | local_irq_restore(flags); | 795 | local_irq_restore(flags); |
| 789 | } | 796 | } |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 57b939c81bce..c7a036facbe1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | |||
| 352 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | 352 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); |
| 353 | struct rq *rq = rq_of_dl_rq(dl_rq); | 353 | struct rq *rq = rq_of_dl_rq(dl_rq); |
| 354 | 354 | ||
| 355 | WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); | 355 | WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); |
| 356 | |||
| 357 | /* | ||
| 358 | * We are racing with the deadline timer. So, do nothing because | ||
| 359 | * the deadline timer handler will take care of properly recharging | ||
| 360 | * the runtime and postponing the deadline | ||
| 361 | */ | ||
| 362 | if (dl_se->dl_throttled) | ||
| 363 | return; | ||
| 356 | 364 | ||
| 357 | /* | 365 | /* |
| 358 | * We use the regular wall clock time to set deadlines in the | 366 | * We use the regular wall clock time to set deadlines in the |
| @@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | |||
| 361 | */ | 369 | */ |
| 362 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 370 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; |
| 363 | dl_se->runtime = pi_se->dl_runtime; | 371 | dl_se->runtime = pi_se->dl_runtime; |
| 364 | dl_se->dl_new = 0; | ||
| 365 | } | 372 | } |
| 366 | 373 | ||
| 367 | /* | 374 | /* |
| @@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, | |||
| 399 | dl_se->runtime = pi_se->dl_runtime; | 406 | dl_se->runtime = pi_se->dl_runtime; |
| 400 | } | 407 | } |
| 401 | 408 | ||
| 409 | if (dl_se->dl_yielded && dl_se->runtime > 0) | ||
| 410 | dl_se->runtime = 0; | ||
| 411 | |||
| 402 | /* | 412 | /* |
| 403 | * We keep moving the deadline away until we get some | 413 | * We keep moving the deadline away until we get some |
| 404 | * available runtime for the entity. This ensures correct | 414 | * available runtime for the entity. This ensures correct |
| @@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, | |||
| 500 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | 510 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); |
| 501 | struct rq *rq = rq_of_dl_rq(dl_rq); | 511 | struct rq *rq = rq_of_dl_rq(dl_rq); |
| 502 | 512 | ||
| 503 | /* | ||
| 504 | * The arrival of a new instance needs special treatment, i.e., | ||
| 505 | * the actual scheduling parameters have to be "renewed". | ||
| 506 | */ | ||
| 507 | if (dl_se->dl_new) { | ||
| 508 | setup_new_dl_entity(dl_se, pi_se); | ||
| 509 | return; | ||
| 510 | } | ||
| 511 | |||
| 512 | if (dl_time_before(dl_se->deadline, rq_clock(rq)) || | 513 | if (dl_time_before(dl_se->deadline, rq_clock(rq)) || |
| 513 | dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { | 514 | dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { |
| 514 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 515 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; |
| @@ -605,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 605 | } | 606 | } |
| 606 | 607 | ||
| 607 | /* | 608 | /* |
| 608 | * This is possible if switched_from_dl() raced against a running | ||
| 609 | * callback that took the above !dl_task() path and we've since then | ||
| 610 | * switched back into SCHED_DEADLINE. | ||
| 611 | * | ||
| 612 | * There's nothing to do except drop our task reference. | ||
| 613 | */ | ||
| 614 | if (dl_se->dl_new) | ||
| 615 | goto unlock; | ||
| 616 | |||
| 617 | /* | ||
| 618 | * The task might have been boosted by someone else and might be in the | 609 | * The task might have been boosted by someone else and might be in the |
| 619 | * boosting/deboosting path, its not throttled. | 610 | * boosting/deboosting path, its not throttled. |
| 620 | */ | 611 | */ |
| @@ -735,8 +726,11 @@ static void update_curr_dl(struct rq *rq) | |||
| 735 | * approach need further study. | 726 | * approach need further study. |
| 736 | */ | 727 | */ |
| 737 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; | 728 | delta_exec = rq_clock_task(rq) - curr->se.exec_start; |
| 738 | if (unlikely((s64)delta_exec <= 0)) | 729 | if (unlikely((s64)delta_exec <= 0)) { |
| 730 | if (unlikely(dl_se->dl_yielded)) | ||
| 731 | goto throttle; | ||
| 739 | return; | 732 | return; |
| 733 | } | ||
| 740 | 734 | ||
| 741 | schedstat_set(curr->se.statistics.exec_max, | 735 | schedstat_set(curr->se.statistics.exec_max, |
| 742 | max(curr->se.statistics.exec_max, delta_exec)); | 736 | max(curr->se.statistics.exec_max, delta_exec)); |
| @@ -749,8 +743,10 @@ static void update_curr_dl(struct rq *rq) | |||
| 749 | 743 | ||
| 750 | sched_rt_avg_update(rq, delta_exec); | 744 | sched_rt_avg_update(rq, delta_exec); |
| 751 | 745 | ||
| 752 | dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; | 746 | dl_se->runtime -= delta_exec; |
| 753 | if (dl_runtime_exceeded(dl_se)) { | 747 | |
| 748 | throttle: | ||
| 749 | if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { | ||
| 754 | dl_se->dl_throttled = 1; | 750 | dl_se->dl_throttled = 1; |
| 755 | __dequeue_task_dl(rq, curr, 0); | 751 | __dequeue_task_dl(rq, curr, 0); |
| 756 | if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) | 752 | if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) |
| @@ -917,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, | |||
| 917 | * parameters of the task might need updating. Otherwise, | 913 | * parameters of the task might need updating. Otherwise, |
| 918 | * we want a replenishment of its runtime. | 914 | * we want a replenishment of its runtime. |
| 919 | */ | 915 | */ |
| 920 | if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) | 916 | if (flags & ENQUEUE_WAKEUP) |
| 921 | update_dl_entity(dl_se, pi_se); | 917 | update_dl_entity(dl_se, pi_se); |
| 922 | else if (flags & ENQUEUE_REPLENISH) | 918 | else if (flags & ENQUEUE_REPLENISH) |
| 923 | replenish_dl_entity(dl_se, pi_se); | 919 | replenish_dl_entity(dl_se, pi_se); |
| @@ -994,18 +990,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
| 994 | */ | 990 | */ |
| 995 | static void yield_task_dl(struct rq *rq) | 991 | static void yield_task_dl(struct rq *rq) |
| 996 | { | 992 | { |
| 997 | struct task_struct *p = rq->curr; | ||
| 998 | |||
| 999 | /* | 993 | /* |
| 1000 | * We make the task go to sleep until its current deadline by | 994 | * We make the task go to sleep until its current deadline by |
| 1001 | * forcing its runtime to zero. This way, update_curr_dl() stops | 995 | * forcing its runtime to zero. This way, update_curr_dl() stops |
| 1002 | * it and the bandwidth timer will wake it up and will give it | 996 | * it and the bandwidth timer will wake it up and will give it |
| 1003 | * new scheduling parameters (thanks to dl_yielded=1). | 997 | * new scheduling parameters (thanks to dl_yielded=1). |
| 1004 | */ | 998 | */ |
| 1005 | if (p->dl.runtime > 0) { | 999 | rq->curr->dl.dl_yielded = 1; |
| 1006 | rq->curr->dl.dl_yielded = 1; | 1000 | |
| 1007 | p->dl.runtime = 0; | ||
| 1008 | } | ||
| 1009 | update_rq_clock(rq); | 1001 | update_rq_clock(rq); |
| 1010 | update_curr_dl(rq); | 1002 | update_curr_dl(rq); |
| 1011 | /* | 1003 | /* |
| @@ -1722,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
| 1722 | */ | 1714 | */ |
| 1723 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | 1715 | static void switched_to_dl(struct rq *rq, struct task_struct *p) |
| 1724 | { | 1716 | { |
| 1717 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) | ||
| 1718 | setup_new_dl_entity(&p->dl, &p->dl); | ||
| 1719 | |||
| 1725 | if (task_on_rq_queued(p) && rq->curr != p) { | 1720 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 1726 | #ifdef CONFIG_SMP | 1721 | #ifdef CONFIG_SMP |
| 1727 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) | 1722 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) |
| @@ -1768,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, | |||
| 1768 | */ | 1763 | */ |
| 1769 | resched_curr(rq); | 1764 | resched_curr(rq); |
| 1770 | #endif /* CONFIG_SMP */ | 1765 | #endif /* CONFIG_SMP */ |
| 1771 | } else | 1766 | } |
| 1772 | switched_to_dl(rq, p); | ||
| 1773 | } | 1767 | } |
| 1774 | 1768 | ||
| 1775 | const struct sched_class dl_sched_class = { | 1769 | const struct sched_class dl_sched_class = { |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..4fbc3bd5ff60 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
| 17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
| 18 | #include <linux/mempolicy.h> | 18 | #include <linux/mempolicy.h> |
| 19 | #include <linux/debugfs.h> | ||
| 19 | 20 | ||
| 20 | #include "sched.h" | 21 | #include "sched.h" |
| 21 | 22 | ||
| @@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec) | |||
| 58 | 59 | ||
| 59 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) | 60 | #define SPLIT_NS(x) nsec_high(x), nsec_low(x) |
| 60 | 61 | ||
| 62 | #define SCHED_FEAT(name, enabled) \ | ||
| 63 | #name , | ||
| 64 | |||
| 65 | static const char * const sched_feat_names[] = { | ||
| 66 | #include "features.h" | ||
| 67 | }; | ||
| 68 | |||
| 69 | #undef SCHED_FEAT | ||
| 70 | |||
| 71 | static int sched_feat_show(struct seq_file *m, void *v) | ||
| 72 | { | ||
| 73 | int i; | ||
| 74 | |||
| 75 | for (i = 0; i < __SCHED_FEAT_NR; i++) { | ||
| 76 | if (!(sysctl_sched_features & (1UL << i))) | ||
| 77 | seq_puts(m, "NO_"); | ||
| 78 | seq_printf(m, "%s ", sched_feat_names[i]); | ||
| 79 | } | ||
| 80 | seq_puts(m, "\n"); | ||
| 81 | |||
| 82 | return 0; | ||
| 83 | } | ||
| 84 | |||
| 85 | #ifdef HAVE_JUMP_LABEL | ||
| 86 | |||
| 87 | #define jump_label_key__true STATIC_KEY_INIT_TRUE | ||
| 88 | #define jump_label_key__false STATIC_KEY_INIT_FALSE | ||
| 89 | |||
| 90 | #define SCHED_FEAT(name, enabled) \ | ||
| 91 | jump_label_key__##enabled , | ||
| 92 | |||
| 93 | struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { | ||
| 94 | #include "features.h" | ||
| 95 | }; | ||
| 96 | |||
| 97 | #undef SCHED_FEAT | ||
| 98 | |||
| 99 | static void sched_feat_disable(int i) | ||
| 100 | { | ||
| 101 | static_key_disable(&sched_feat_keys[i]); | ||
| 102 | } | ||
| 103 | |||
| 104 | static void sched_feat_enable(int i) | ||
| 105 | { | ||
| 106 | static_key_enable(&sched_feat_keys[i]); | ||
| 107 | } | ||
| 108 | #else | ||
| 109 | static void sched_feat_disable(int i) { }; | ||
| 110 | static void sched_feat_enable(int i) { }; | ||
| 111 | #endif /* HAVE_JUMP_LABEL */ | ||
| 112 | |||
| 113 | static int sched_feat_set(char *cmp) | ||
| 114 | { | ||
| 115 | int i; | ||
| 116 | int neg = 0; | ||
| 117 | |||
| 118 | if (strncmp(cmp, "NO_", 3) == 0) { | ||
| 119 | neg = 1; | ||
| 120 | cmp += 3; | ||
| 121 | } | ||
| 122 | |||
| 123 | for (i = 0; i < __SCHED_FEAT_NR; i++) { | ||
| 124 | if (strcmp(cmp, sched_feat_names[i]) == 0) { | ||
| 125 | if (neg) { | ||
| 126 | sysctl_sched_features &= ~(1UL << i); | ||
| 127 | sched_feat_disable(i); | ||
| 128 | } else { | ||
| 129 | sysctl_sched_features |= (1UL << i); | ||
| 130 | sched_feat_enable(i); | ||
| 131 | } | ||
| 132 | break; | ||
| 133 | } | ||
| 134 | } | ||
| 135 | |||
| 136 | return i; | ||
| 137 | } | ||
| 138 | |||
| 139 | static ssize_t | ||
| 140 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
| 141 | size_t cnt, loff_t *ppos) | ||
| 142 | { | ||
| 143 | char buf[64]; | ||
| 144 | char *cmp; | ||
| 145 | int i; | ||
| 146 | struct inode *inode; | ||
| 147 | |||
| 148 | if (cnt > 63) | ||
| 149 | cnt = 63; | ||
| 150 | |||
| 151 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 152 | return -EFAULT; | ||
| 153 | |||
| 154 | buf[cnt] = 0; | ||
| 155 | cmp = strstrip(buf); | ||
| 156 | |||
| 157 | /* Ensure the static_key remains in a consistent state */ | ||
| 158 | inode = file_inode(filp); | ||
| 159 | inode_lock(inode); | ||
| 160 | i = sched_feat_set(cmp); | ||
| 161 | inode_unlock(inode); | ||
| 162 | if (i == __SCHED_FEAT_NR) | ||
| 163 | return -EINVAL; | ||
| 164 | |||
| 165 | *ppos += cnt; | ||
| 166 | |||
| 167 | return cnt; | ||
| 168 | } | ||
| 169 | |||
| 170 | static int sched_feat_open(struct inode *inode, struct file *filp) | ||
| 171 | { | ||
| 172 | return single_open(filp, sched_feat_show, NULL); | ||
| 173 | } | ||
| 174 | |||
| 175 | static const struct file_operations sched_feat_fops = { | ||
| 176 | .open = sched_feat_open, | ||
| 177 | .write = sched_feat_write, | ||
| 178 | .read = seq_read, | ||
| 179 | .llseek = seq_lseek, | ||
| 180 | .release = single_release, | ||
| 181 | }; | ||
| 182 | |||
| 183 | static __init int sched_init_debug(void) | ||
| 184 | { | ||
| 185 | debugfs_create_file("sched_features", 0644, NULL, NULL, | ||
| 186 | &sched_feat_fops); | ||
| 187 | |||
| 188 | return 0; | ||
| 189 | } | ||
| 190 | late_initcall(sched_init_debug); | ||
| 191 | |||
| 192 | #ifdef CONFIG_SMP | ||
| 193 | |||
| 194 | #ifdef CONFIG_SYSCTL | ||
| 195 | |||
| 196 | static struct ctl_table sd_ctl_dir[] = { | ||
| 197 | { | ||
| 198 | .procname = "sched_domain", | ||
| 199 | .mode = 0555, | ||
| 200 | }, | ||
| 201 | {} | ||
| 202 | }; | ||
| 203 | |||
| 204 | static struct ctl_table sd_ctl_root[] = { | ||
| 205 | { | ||
| 206 | .procname = "kernel", | ||
| 207 | .mode = 0555, | ||
| 208 | .child = sd_ctl_dir, | ||
| 209 | }, | ||
| 210 | {} | ||
| 211 | }; | ||
| 212 | |||
| 213 | static struct ctl_table *sd_alloc_ctl_entry(int n) | ||
| 214 | { | ||
| 215 | struct ctl_table *entry = | ||
| 216 | kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); | ||
| 217 | |||
| 218 | return entry; | ||
| 219 | } | ||
| 220 | |||
| 221 | static void sd_free_ctl_entry(struct ctl_table **tablep) | ||
| 222 | { | ||
| 223 | struct ctl_table *entry; | ||
| 224 | |||
| 225 | /* | ||
| 226 | * In the intermediate directories, both the child directory and | ||
| 227 | * procname are dynamically allocated and could fail but the mode | ||
| 228 | * will always be set. In the lowest directory the names are | ||
| 229 | * static strings and all have proc handlers. | ||
| 230 | */ | ||
| 231 | for (entry = *tablep; entry->mode; entry++) { | ||
| 232 | if (entry->child) | ||
| 233 | sd_free_ctl_entry(&entry->child); | ||
| 234 | if (entry->proc_handler == NULL) | ||
| 235 | kfree(entry->procname); | ||
| 236 | } | ||
| 237 | |||
| 238 | kfree(*tablep); | ||
| 239 | *tablep = NULL; | ||
| 240 | } | ||
| 241 | |||
| 242 | static int min_load_idx = 0; | ||
| 243 | static int max_load_idx = CPU_LOAD_IDX_MAX-1; | ||
| 244 | |||
| 245 | static void | ||
| 246 | set_table_entry(struct ctl_table *entry, | ||
| 247 | const char *procname, void *data, int maxlen, | ||
| 248 | umode_t mode, proc_handler *proc_handler, | ||
| 249 | bool load_idx) | ||
| 250 | { | ||
| 251 | entry->procname = procname; | ||
| 252 | entry->data = data; | ||
| 253 | entry->maxlen = maxlen; | ||
| 254 | entry->mode = mode; | ||
| 255 | entry->proc_handler = proc_handler; | ||
| 256 | |||
| 257 | if (load_idx) { | ||
| 258 | entry->extra1 = &min_load_idx; | ||
| 259 | entry->extra2 = &max_load_idx; | ||
| 260 | } | ||
| 261 | } | ||
| 262 | |||
| 263 | static struct ctl_table * | ||
| 264 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | ||
| 265 | { | ||
| 266 | struct ctl_table *table = sd_alloc_ctl_entry(14); | ||
| 267 | |||
| 268 | if (table == NULL) | ||
| 269 | return NULL; | ||
| 270 | |||
| 271 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | ||
| 272 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
| 273 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | ||
| 274 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
| 275 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | ||
| 276 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 277 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | ||
| 278 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 279 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | ||
| 280 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 281 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | ||
| 282 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 283 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | ||
| 284 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 285 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | ||
| 286 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 287 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | ||
| 288 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 289 | set_table_entry(&table[9], "cache_nice_tries", | ||
| 290 | &sd->cache_nice_tries, | ||
| 291 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 292 | set_table_entry(&table[10], "flags", &sd->flags, | ||
| 293 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 294 | set_table_entry(&table[11], "max_newidle_lb_cost", | ||
| 295 | &sd->max_newidle_lb_cost, | ||
| 296 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
| 297 | set_table_entry(&table[12], "name", sd->name, | ||
| 298 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); | ||
| 299 | /* &table[13] is terminator */ | ||
| 300 | |||
| 301 | return table; | ||
| 302 | } | ||
| 303 | |||
| 304 | static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) | ||
| 305 | { | ||
| 306 | struct ctl_table *entry, *table; | ||
| 307 | struct sched_domain *sd; | ||
| 308 | int domain_num = 0, i; | ||
| 309 | char buf[32]; | ||
| 310 | |||
| 311 | for_each_domain(cpu, sd) | ||
| 312 | domain_num++; | ||
| 313 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | ||
| 314 | if (table == NULL) | ||
| 315 | return NULL; | ||
| 316 | |||
| 317 | i = 0; | ||
| 318 | for_each_domain(cpu, sd) { | ||
| 319 | snprintf(buf, 32, "domain%d", i); | ||
| 320 | entry->procname = kstrdup(buf, GFP_KERNEL); | ||
| 321 | entry->mode = 0555; | ||
| 322 | entry->child = sd_alloc_ctl_domain_table(sd); | ||
| 323 | entry++; | ||
| 324 | i++; | ||
| 325 | } | ||
| 326 | return table; | ||
| 327 | } | ||
| 328 | |||
| 329 | static struct ctl_table_header *sd_sysctl_header; | ||
| 330 | void register_sched_domain_sysctl(void) | ||
| 331 | { | ||
| 332 | int i, cpu_num = num_possible_cpus(); | ||
| 333 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | ||
| 334 | char buf[32]; | ||
| 335 | |||
| 336 | WARN_ON(sd_ctl_dir[0].child); | ||
| 337 | sd_ctl_dir[0].child = entry; | ||
| 338 | |||
| 339 | if (entry == NULL) | ||
| 340 | return; | ||
| 341 | |||
| 342 | for_each_possible_cpu(i) { | ||
| 343 | snprintf(buf, 32, "cpu%d", i); | ||
| 344 | entry->procname = kstrdup(buf, GFP_KERNEL); | ||
| 345 | entry->mode = 0555; | ||
| 346 | entry->child = sd_alloc_ctl_cpu_table(i); | ||
| 347 | entry++; | ||
| 348 | } | ||
| 349 | |||
| 350 | WARN_ON(sd_sysctl_header); | ||
| 351 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | ||
| 352 | } | ||
| 353 | |||
| 354 | /* may be called multiple times per register */ | ||
| 355 | void unregister_sched_domain_sysctl(void) | ||
| 356 | { | ||
| 357 | unregister_sysctl_table(sd_sysctl_header); | ||
| 358 | sd_sysctl_header = NULL; | ||
| 359 | if (sd_ctl_dir[0].child) | ||
| 360 | sd_free_ctl_entry(&sd_ctl_dir[0].child); | ||
| 361 | } | ||
| 362 | #endif /* CONFIG_SYSCTL */ | ||
| 363 | #endif /* CONFIG_SMP */ | ||
| 364 | |||
| 61 | #ifdef CONFIG_FAIR_GROUP_SCHED | 365 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 62 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) | 366 | static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) |
| 63 | { | 367 | { |
| @@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 75 | PN(se->vruntime); | 379 | PN(se->vruntime); |
| 76 | PN(se->sum_exec_runtime); | 380 | PN(se->sum_exec_runtime); |
| 77 | #ifdef CONFIG_SCHEDSTATS | 381 | #ifdef CONFIG_SCHEDSTATS |
| 78 | PN(se->statistics.wait_start); | 382 | if (schedstat_enabled()) { |
| 79 | PN(se->statistics.sleep_start); | 383 | PN(se->statistics.wait_start); |
| 80 | PN(se->statistics.block_start); | 384 | PN(se->statistics.sleep_start); |
| 81 | PN(se->statistics.sleep_max); | 385 | PN(se->statistics.block_start); |
| 82 | PN(se->statistics.block_max); | 386 | PN(se->statistics.sleep_max); |
| 83 | PN(se->statistics.exec_max); | 387 | PN(se->statistics.block_max); |
| 84 | PN(se->statistics.slice_max); | 388 | PN(se->statistics.exec_max); |
| 85 | PN(se->statistics.wait_max); | 389 | PN(se->statistics.slice_max); |
| 86 | PN(se->statistics.wait_sum); | 390 | PN(se->statistics.wait_max); |
| 87 | P(se->statistics.wait_count); | 391 | PN(se->statistics.wait_sum); |
| 392 | P(se->statistics.wait_count); | ||
| 393 | } | ||
| 88 | #endif | 394 | #endif |
| 89 | P(se->load.weight); | 395 | P(se->load.weight); |
| 90 | #ifdef CONFIG_SMP | 396 | #ifdef CONFIG_SMP |
| @@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 122 | (long long)(p->nvcsw + p->nivcsw), | 428 | (long long)(p->nvcsw + p->nivcsw), |
| 123 | p->prio); | 429 | p->prio); |
| 124 | #ifdef CONFIG_SCHEDSTATS | 430 | #ifdef CONFIG_SCHEDSTATS |
| 125 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | 431 | if (schedstat_enabled()) { |
| 126 | SPLIT_NS(p->se.statistics.wait_sum), | 432 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
| 127 | SPLIT_NS(p->se.sum_exec_runtime), | 433 | SPLIT_NS(p->se.statistics.wait_sum), |
| 128 | SPLIT_NS(p->se.statistics.sum_sleep_runtime)); | 434 | SPLIT_NS(p->se.sum_exec_runtime), |
| 435 | SPLIT_NS(p->se.statistics.sum_sleep_runtime)); | ||
| 436 | } | ||
| 129 | #else | 437 | #else |
| 130 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", | 438 | SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", |
| 131 | 0LL, 0L, | 439 | 0LL, 0L, |
| @@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | |||
| 258 | 566 | ||
| 259 | void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) | 567 | void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) |
| 260 | { | 568 | { |
| 569 | struct dl_bw *dl_bw; | ||
| 570 | |||
| 261 | SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); | 571 | SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); |
| 262 | SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); | 572 | SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); |
| 573 | #ifdef CONFIG_SMP | ||
| 574 | dl_bw = &cpu_rq(cpu)->rd->dl_bw; | ||
| 575 | #else | ||
| 576 | dl_bw = &dl_rq->dl_bw; | ||
| 577 | #endif | ||
| 578 | SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); | ||
| 579 | SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); | ||
| 263 | } | 580 | } |
| 264 | 581 | ||
| 265 | extern __read_mostly int sched_clock_running; | 582 | extern __read_mostly int sched_clock_running; |
| @@ -313,17 +630,18 @@ do { \ | |||
| 313 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); | 630 | #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); |
| 314 | #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); | 631 | #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); |
| 315 | 632 | ||
| 316 | P(yld_count); | ||
| 317 | |||
| 318 | P(sched_count); | ||
| 319 | P(sched_goidle); | ||
| 320 | #ifdef CONFIG_SMP | 633 | #ifdef CONFIG_SMP |
| 321 | P64(avg_idle); | 634 | P64(avg_idle); |
| 322 | P64(max_idle_balance_cost); | 635 | P64(max_idle_balance_cost); |
| 323 | #endif | 636 | #endif |
| 324 | 637 | ||
| 325 | P(ttwu_count); | 638 | if (schedstat_enabled()) { |
| 326 | P(ttwu_local); | 639 | P(yld_count); |
| 640 | P(sched_count); | ||
| 641 | P(sched_goidle); | ||
| 642 | P(ttwu_count); | ||
| 643 | P(ttwu_local); | ||
| 644 | } | ||
| 327 | 645 | ||
| 328 | #undef P | 646 | #undef P |
| 329 | #undef P64 | 647 | #undef P64 |
| @@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 569 | nr_switches = p->nvcsw + p->nivcsw; | 887 | nr_switches = p->nvcsw + p->nivcsw; |
| 570 | 888 | ||
| 571 | #ifdef CONFIG_SCHEDSTATS | 889 | #ifdef CONFIG_SCHEDSTATS |
| 572 | PN(se.statistics.sum_sleep_runtime); | ||
| 573 | PN(se.statistics.wait_start); | ||
| 574 | PN(se.statistics.sleep_start); | ||
| 575 | PN(se.statistics.block_start); | ||
| 576 | PN(se.statistics.sleep_max); | ||
| 577 | PN(se.statistics.block_max); | ||
| 578 | PN(se.statistics.exec_max); | ||
| 579 | PN(se.statistics.slice_max); | ||
| 580 | PN(se.statistics.wait_max); | ||
| 581 | PN(se.statistics.wait_sum); | ||
| 582 | P(se.statistics.wait_count); | ||
| 583 | PN(se.statistics.iowait_sum); | ||
| 584 | P(se.statistics.iowait_count); | ||
| 585 | P(se.nr_migrations); | 890 | P(se.nr_migrations); |
| 586 | P(se.statistics.nr_migrations_cold); | ||
| 587 | P(se.statistics.nr_failed_migrations_affine); | ||
| 588 | P(se.statistics.nr_failed_migrations_running); | ||
| 589 | P(se.statistics.nr_failed_migrations_hot); | ||
| 590 | P(se.statistics.nr_forced_migrations); | ||
| 591 | P(se.statistics.nr_wakeups); | ||
| 592 | P(se.statistics.nr_wakeups_sync); | ||
| 593 | P(se.statistics.nr_wakeups_migrate); | ||
| 594 | P(se.statistics.nr_wakeups_local); | ||
| 595 | P(se.statistics.nr_wakeups_remote); | ||
| 596 | P(se.statistics.nr_wakeups_affine); | ||
| 597 | P(se.statistics.nr_wakeups_affine_attempts); | ||
| 598 | P(se.statistics.nr_wakeups_passive); | ||
| 599 | P(se.statistics.nr_wakeups_idle); | ||
| 600 | 891 | ||
| 601 | { | 892 | if (schedstat_enabled()) { |
| 602 | u64 avg_atom, avg_per_cpu; | 893 | u64 avg_atom, avg_per_cpu; |
| 603 | 894 | ||
| 895 | PN(se.statistics.sum_sleep_runtime); | ||
| 896 | PN(se.statistics.wait_start); | ||
| 897 | PN(se.statistics.sleep_start); | ||
| 898 | PN(se.statistics.block_start); | ||
| 899 | PN(se.statistics.sleep_max); | ||
| 900 | PN(se.statistics.block_max); | ||
| 901 | PN(se.statistics.exec_max); | ||
| 902 | PN(se.statistics.slice_max); | ||
| 903 | PN(se.statistics.wait_max); | ||
| 904 | PN(se.statistics.wait_sum); | ||
| 905 | P(se.statistics.wait_count); | ||
| 906 | PN(se.statistics.iowait_sum); | ||
| 907 | P(se.statistics.iowait_count); | ||
| 908 | P(se.statistics.nr_migrations_cold); | ||
| 909 | P(se.statistics.nr_failed_migrations_affine); | ||
| 910 | P(se.statistics.nr_failed_migrations_running); | ||
| 911 | P(se.statistics.nr_failed_migrations_hot); | ||
| 912 | P(se.statistics.nr_forced_migrations); | ||
| 913 | P(se.statistics.nr_wakeups); | ||
| 914 | P(se.statistics.nr_wakeups_sync); | ||
| 915 | P(se.statistics.nr_wakeups_migrate); | ||
| 916 | P(se.statistics.nr_wakeups_local); | ||
| 917 | P(se.statistics.nr_wakeups_remote); | ||
| 918 | P(se.statistics.nr_wakeups_affine); | ||
| 919 | P(se.statistics.nr_wakeups_affine_attempts); | ||
| 920 | P(se.statistics.nr_wakeups_passive); | ||
| 921 | P(se.statistics.nr_wakeups_idle); | ||
| 922 | |||
| 604 | avg_atom = p->se.sum_exec_runtime; | 923 | avg_atom = p->se.sum_exec_runtime; |
| 605 | if (nr_switches) | 924 | if (nr_switches) |
| 606 | avg_atom = div64_ul(avg_atom, nr_switches); | 925 | avg_atom = div64_ul(avg_atom, nr_switches); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b83947..33130529e9b5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -20,8 +20,8 @@ | |||
| 20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra | 20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra |
| 21 | */ | 21 | */ |
| 22 | 22 | ||
| 23 | #include <linux/latencytop.h> | ||
| 24 | #include <linux/sched.h> | 23 | #include <linux/sched.h> |
| 24 | #include <linux/latencytop.h> | ||
| 25 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
| 26 | #include <linux/cpuidle.h> | 26 | #include <linux/cpuidle.h> |
| 27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
| @@ -755,7 +755,9 @@ static void | |||
| 755 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 755 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 756 | { | 756 | { |
| 757 | struct task_struct *p; | 757 | struct task_struct *p; |
| 758 | u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; | 758 | u64 delta; |
| 759 | |||
| 760 | delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; | ||
| 759 | 761 | ||
| 760 | if (entity_is_task(se)) { | 762 | if (entity_is_task(se)) { |
| 761 | p = task_of(se); | 763 | p = task_of(se); |
| @@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 776 | se->statistics.wait_sum += delta; | 778 | se->statistics.wait_sum += delta; |
| 777 | se->statistics.wait_start = 0; | 779 | se->statistics.wait_start = 0; |
| 778 | } | 780 | } |
| 779 | #else | ||
| 780 | static inline void | ||
| 781 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 782 | { | ||
| 783 | } | ||
| 784 | |||
| 785 | static inline void | ||
| 786 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 787 | { | ||
| 788 | } | ||
| 789 | #endif | ||
| 790 | 781 | ||
| 791 | /* | 782 | /* |
| 792 | * Task is being enqueued - update stats: | 783 | * Task is being enqueued - update stats: |
| 793 | */ | 784 | */ |
| 794 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 785 | static inline void |
| 786 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 795 | { | 787 | { |
| 796 | /* | 788 | /* |
| 797 | * Are we enqueueing a waiting task? (for current tasks | 789 | * Are we enqueueing a waiting task? (for current tasks |
| @@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 802 | } | 794 | } |
| 803 | 795 | ||
| 804 | static inline void | 796 | static inline void |
| 805 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 797 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
| 806 | { | 798 | { |
| 807 | /* | 799 | /* |
| 808 | * Mark the end of the wait period if dequeueing a | 800 | * Mark the end of the wait period if dequeueing a |
| @@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 810 | */ | 802 | */ |
| 811 | if (se != cfs_rq->curr) | 803 | if (se != cfs_rq->curr) |
| 812 | update_stats_wait_end(cfs_rq, se); | 804 | update_stats_wait_end(cfs_rq, se); |
| 805 | |||
| 806 | if (flags & DEQUEUE_SLEEP) { | ||
| 807 | if (entity_is_task(se)) { | ||
| 808 | struct task_struct *tsk = task_of(se); | ||
| 809 | |||
| 810 | if (tsk->state & TASK_INTERRUPTIBLE) | ||
| 811 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); | ||
| 812 | if (tsk->state & TASK_UNINTERRUPTIBLE) | ||
| 813 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); | ||
| 814 | } | ||
| 815 | } | ||
| 816 | |||
| 817 | } | ||
| 818 | #else | ||
| 819 | static inline void | ||
| 820 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 821 | { | ||
| 813 | } | 822 | } |
| 814 | 823 | ||
| 824 | static inline void | ||
| 825 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 826 | { | ||
| 827 | } | ||
| 828 | |||
| 829 | static inline void | ||
| 830 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 831 | { | ||
| 832 | } | ||
| 833 | |||
| 834 | static inline void | ||
| 835 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | ||
| 836 | { | ||
| 837 | } | ||
| 838 | #endif | ||
| 839 | |||
| 815 | /* | 840 | /* |
| 816 | * We are picking a new current task - update its stats: | 841 | * We are picking a new current task - update its stats: |
| 817 | */ | 842 | */ |
| @@ -907,10 +932,11 @@ struct numa_group { | |||
| 907 | spinlock_t lock; /* nr_tasks, tasks */ | 932 | spinlock_t lock; /* nr_tasks, tasks */ |
| 908 | int nr_tasks; | 933 | int nr_tasks; |
| 909 | pid_t gid; | 934 | pid_t gid; |
| 935 | int active_nodes; | ||
| 910 | 936 | ||
| 911 | struct rcu_head rcu; | 937 | struct rcu_head rcu; |
| 912 | nodemask_t active_nodes; | ||
| 913 | unsigned long total_faults; | 938 | unsigned long total_faults; |
| 939 | unsigned long max_faults_cpu; | ||
| 914 | /* | 940 | /* |
| 915 | * Faults_cpu is used to decide whether memory should move | 941 | * Faults_cpu is used to decide whether memory should move |
| 916 | * towards the CPU. As a consequence, these stats are weighted | 942 | * towards the CPU. As a consequence, these stats are weighted |
| @@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) | |||
| 969 | group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; | 995 | group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; |
| 970 | } | 996 | } |
| 971 | 997 | ||
| 998 | /* | ||
| 999 | * A node triggering more than 1/3 as many NUMA faults as the maximum is | ||
| 1000 | * considered part of a numa group's pseudo-interleaving set. Migrations | ||
| 1001 | * between these nodes are slowed down, to allow things to settle down. | ||
| 1002 | */ | ||
| 1003 | #define ACTIVE_NODE_FRACTION 3 | ||
| 1004 | |||
| 1005 | static bool numa_is_active_node(int nid, struct numa_group *ng) | ||
| 1006 | { | ||
| 1007 | return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; | ||
| 1008 | } | ||
| 1009 | |||
| 972 | /* Handle placement on systems where not all nodes are directly connected. */ | 1010 | /* Handle placement on systems where not all nodes are directly connected. */ |
| 973 | static unsigned long score_nearby_nodes(struct task_struct *p, int nid, | 1011 | static unsigned long score_nearby_nodes(struct task_struct *p, int nid, |
| 974 | int maxdist, bool task) | 1012 | int maxdist, bool task) |
| @@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | |||
| 1118 | return true; | 1156 | return true; |
| 1119 | 1157 | ||
| 1120 | /* | 1158 | /* |
| 1121 | * Do not migrate if the destination is not a node that | 1159 | * Destination node is much more heavily used than the source |
| 1122 | * is actively used by this numa group. | 1160 | * node? Allow migration. |
| 1123 | */ | 1161 | */ |
| 1124 | if (!node_isset(dst_nid, ng->active_nodes)) | 1162 | if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * |
| 1125 | return false; | 1163 | ACTIVE_NODE_FRACTION) |
| 1126 | |||
| 1127 | /* | ||
| 1128 | * Source is a node that is not actively used by this | ||
| 1129 | * numa group, while the destination is. Migrate. | ||
| 1130 | */ | ||
| 1131 | if (!node_isset(src_nid, ng->active_nodes)) | ||
| 1132 | return true; | 1164 | return true; |
| 1133 | 1165 | ||
| 1134 | /* | 1166 | /* |
| 1135 | * Both source and destination are nodes in active | 1167 | * Distribute memory according to CPU & memory use on each node, |
| 1136 | * use by this numa group. Maximize memory bandwidth | 1168 | * with 3/4 hysteresis to avoid unnecessary memory migrations: |
| 1137 | * by migrating from more heavily used groups, to less | 1169 | * |
| 1138 | * heavily used ones, spreading the load around. | 1170 | * faults_cpu(dst) 3 faults_cpu(src) |
| 1139 | * Use a 1/4 hysteresis to avoid spurious page movement. | 1171 | * --------------- * - > --------------- |
| 1172 | * faults_mem(dst) 4 faults_mem(src) | ||
| 1140 | */ | 1173 | */ |
| 1141 | return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); | 1174 | return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > |
| 1175 | group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; | ||
| 1142 | } | 1176 | } |
| 1143 | 1177 | ||
| 1144 | static unsigned long weighted_cpuload(const int cpu); | 1178 | static unsigned long weighted_cpuload(const int cpu); |
| @@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1484 | 1518 | ||
| 1485 | .best_task = NULL, | 1519 | .best_task = NULL, |
| 1486 | .best_imp = 0, | 1520 | .best_imp = 0, |
| 1487 | .best_cpu = -1 | 1521 | .best_cpu = -1, |
| 1488 | }; | 1522 | }; |
| 1489 | struct sched_domain *sd; | 1523 | struct sched_domain *sd; |
| 1490 | unsigned long taskweight, groupweight; | 1524 | unsigned long taskweight, groupweight; |
| @@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1536 | * multiple NUMA nodes; in order to better consolidate the group, | 1570 | * multiple NUMA nodes; in order to better consolidate the group, |
| 1537 | * we need to check other locations. | 1571 | * we need to check other locations. |
| 1538 | */ | 1572 | */ |
| 1539 | if (env.best_cpu == -1 || (p->numa_group && | 1573 | if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { |
| 1540 | nodes_weight(p->numa_group->active_nodes) > 1)) { | ||
| 1541 | for_each_online_node(nid) { | 1574 | for_each_online_node(nid) { |
| 1542 | if (nid == env.src_nid || nid == p->numa_preferred_nid) | 1575 | if (nid == env.src_nid || nid == p->numa_preferred_nid) |
| 1543 | continue; | 1576 | continue; |
| @@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1572 | * trying for a better one later. Do not set the preferred node here. | 1605 | * trying for a better one later. Do not set the preferred node here. |
| 1573 | */ | 1606 | */ |
| 1574 | if (p->numa_group) { | 1607 | if (p->numa_group) { |
| 1608 | struct numa_group *ng = p->numa_group; | ||
| 1609 | |||
| 1575 | if (env.best_cpu == -1) | 1610 | if (env.best_cpu == -1) |
| 1576 | nid = env.src_nid; | 1611 | nid = env.src_nid; |
| 1577 | else | 1612 | else |
| 1578 | nid = env.dst_nid; | 1613 | nid = env.dst_nid; |
| 1579 | 1614 | ||
| 1580 | if (node_isset(nid, p->numa_group->active_nodes)) | 1615 | if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) |
| 1581 | sched_setnuma(p, env.dst_nid); | 1616 | sched_setnuma(p, env.dst_nid); |
| 1582 | } | 1617 | } |
| 1583 | 1618 | ||
| @@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
| 1627 | } | 1662 | } |
| 1628 | 1663 | ||
| 1629 | /* | 1664 | /* |
| 1630 | * Find the nodes on which the workload is actively running. We do this by | 1665 | * Find out how many nodes on the workload is actively running on. Do this by |
| 1631 | * tracking the nodes from which NUMA hinting faults are triggered. This can | 1666 | * tracking the nodes from which NUMA hinting faults are triggered. This can |
| 1632 | * be different from the set of nodes where the workload's memory is currently | 1667 | * be different from the set of nodes where the workload's memory is currently |
| 1633 | * located. | 1668 | * located. |
| 1634 | * | ||
| 1635 | * The bitmask is used to make smarter decisions on when to do NUMA page | ||
| 1636 | * migrations, To prevent flip-flopping, and excessive page migrations, nodes | ||
| 1637 | * are added when they cause over 6/16 of the maximum number of faults, but | ||
| 1638 | * only removed when they drop below 3/16. | ||
| 1639 | */ | 1669 | */ |
| 1640 | static void update_numa_active_node_mask(struct numa_group *numa_group) | 1670 | static void numa_group_count_active_nodes(struct numa_group *numa_group) |
| 1641 | { | 1671 | { |
| 1642 | unsigned long faults, max_faults = 0; | 1672 | unsigned long faults, max_faults = 0; |
| 1643 | int nid; | 1673 | int nid, active_nodes = 0; |
| 1644 | 1674 | ||
| 1645 | for_each_online_node(nid) { | 1675 | for_each_online_node(nid) { |
| 1646 | faults = group_faults_cpu(numa_group, nid); | 1676 | faults = group_faults_cpu(numa_group, nid); |
| @@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) | |||
| 1650 | 1680 | ||
| 1651 | for_each_online_node(nid) { | 1681 | for_each_online_node(nid) { |
| 1652 | faults = group_faults_cpu(numa_group, nid); | 1682 | faults = group_faults_cpu(numa_group, nid); |
| 1653 | if (!node_isset(nid, numa_group->active_nodes)) { | 1683 | if (faults * ACTIVE_NODE_FRACTION > max_faults) |
| 1654 | if (faults > max_faults * 6 / 16) | 1684 | active_nodes++; |
| 1655 | node_set(nid, numa_group->active_nodes); | ||
| 1656 | } else if (faults < max_faults * 3 / 16) | ||
| 1657 | node_clear(nid, numa_group->active_nodes); | ||
| 1658 | } | 1685 | } |
| 1686 | |||
| 1687 | numa_group->max_faults_cpu = max_faults; | ||
| 1688 | numa_group->active_nodes = active_nodes; | ||
| 1659 | } | 1689 | } |
| 1660 | 1690 | ||
| 1661 | /* | 1691 | /* |
| @@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1946 | update_task_scan_period(p, fault_types[0], fault_types[1]); | 1976 | update_task_scan_period(p, fault_types[0], fault_types[1]); |
| 1947 | 1977 | ||
| 1948 | if (p->numa_group) { | 1978 | if (p->numa_group) { |
| 1949 | update_numa_active_node_mask(p->numa_group); | 1979 | numa_group_count_active_nodes(p->numa_group); |
| 1950 | spin_unlock_irq(group_lock); | 1980 | spin_unlock_irq(group_lock); |
| 1951 | max_nid = preferred_group_nid(p, max_group_nid); | 1981 | max_nid = preferred_group_nid(p, max_group_nid); |
| 1952 | } | 1982 | } |
| @@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1990 | return; | 2020 | return; |
| 1991 | 2021 | ||
| 1992 | atomic_set(&grp->refcount, 1); | 2022 | atomic_set(&grp->refcount, 1); |
| 2023 | grp->active_nodes = 1; | ||
| 2024 | grp->max_faults_cpu = 0; | ||
| 1993 | spin_lock_init(&grp->lock); | 2025 | spin_lock_init(&grp->lock); |
| 1994 | grp->gid = p->pid; | 2026 | grp->gid = p->pid; |
| 1995 | /* Second half of the array tracks nids where faults happen */ | 2027 | /* Second half of the array tracks nids where faults happen */ |
| 1996 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * | 2028 | grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * |
| 1997 | nr_node_ids; | 2029 | nr_node_ids; |
| 1998 | 2030 | ||
| 1999 | node_set(task_node(current), grp->active_nodes); | ||
| 2000 | |||
| 2001 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) | 2031 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
| 2002 | grp->faults[i] = p->numa_faults[i]; | 2032 | grp->faults[i] = p->numa_faults[i]; |
| 2003 | 2033 | ||
| @@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 2111 | bool migrated = flags & TNF_MIGRATED; | 2141 | bool migrated = flags & TNF_MIGRATED; |
| 2112 | int cpu_node = task_node(current); | 2142 | int cpu_node = task_node(current); |
| 2113 | int local = !!(flags & TNF_FAULT_LOCAL); | 2143 | int local = !!(flags & TNF_FAULT_LOCAL); |
| 2144 | struct numa_group *ng; | ||
| 2114 | int priv; | 2145 | int priv; |
| 2115 | 2146 | ||
| 2116 | if (!static_branch_likely(&sched_numa_balancing)) | 2147 | if (!static_branch_likely(&sched_numa_balancing)) |
| @@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 2151 | * actively using should be counted as local. This allows the | 2182 | * actively using should be counted as local. This allows the |
| 2152 | * scan rate to slow down when a workload has settled down. | 2183 | * scan rate to slow down when a workload has settled down. |
| 2153 | */ | 2184 | */ |
| 2154 | if (!priv && !local && p->numa_group && | 2185 | ng = p->numa_group; |
| 2155 | node_isset(cpu_node, p->numa_group->active_nodes) && | 2186 | if (!priv && !local && ng && ng->active_nodes > 1 && |
| 2156 | node_isset(mem_node, p->numa_group->active_nodes)) | 2187 | numa_is_active_node(cpu_node, ng) && |
| 2188 | numa_is_active_node(mem_node, ng)) | ||
| 2157 | local = 1; | 2189 | local = 1; |
| 2158 | 2190 | ||
| 2159 | task_numa_placement(p); | 2191 | task_numa_placement(p); |
| @@ -3102,6 +3134,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
| 3102 | 3134 | ||
| 3103 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); | 3135 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); |
| 3104 | 3136 | ||
| 3137 | static inline void check_schedstat_required(void) | ||
| 3138 | { | ||
| 3139 | #ifdef CONFIG_SCHEDSTATS | ||
| 3140 | if (schedstat_enabled()) | ||
| 3141 | return; | ||
| 3142 | |||
| 3143 | /* Force schedstat enabled if a dependent tracepoint is active */ | ||
| 3144 | if (trace_sched_stat_wait_enabled() || | ||
| 3145 | trace_sched_stat_sleep_enabled() || | ||
| 3146 | trace_sched_stat_iowait_enabled() || | ||
| 3147 | trace_sched_stat_blocked_enabled() || | ||
| 3148 | trace_sched_stat_runtime_enabled()) { | ||
| 3149 | pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " | ||
| 3150 | "stat_blocked and stat_runtime require the " | ||
| 3151 | "kernel parameter schedstats=enabled or " | ||
| 3152 | "kernel.sched_schedstats=1\n"); | ||
| 3153 | } | ||
| 3154 | #endif | ||
| 3155 | } | ||
| 3156 | |||
| 3105 | static void | 3157 | static void |
| 3106 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 3158 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
| 3107 | { | 3159 | { |
| @@ -3122,11 +3174,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 3122 | 3174 | ||
| 3123 | if (flags & ENQUEUE_WAKEUP) { | 3175 | if (flags & ENQUEUE_WAKEUP) { |
| 3124 | place_entity(cfs_rq, se, 0); | 3176 | place_entity(cfs_rq, se, 0); |
| 3125 | enqueue_sleeper(cfs_rq, se); | 3177 | if (schedstat_enabled()) |
| 3178 | enqueue_sleeper(cfs_rq, se); | ||
| 3126 | } | 3179 | } |
| 3127 | 3180 | ||
| 3128 | update_stats_enqueue(cfs_rq, se); | 3181 | check_schedstat_required(); |
| 3129 | check_spread(cfs_rq, se); | 3182 | if (schedstat_enabled()) { |
| 3183 | update_stats_enqueue(cfs_rq, se); | ||
| 3184 | check_spread(cfs_rq, se); | ||
| 3185 | } | ||
| 3130 | if (se != cfs_rq->curr) | 3186 | if (se != cfs_rq->curr) |
| 3131 | __enqueue_entity(cfs_rq, se); | 3187 | __enqueue_entity(cfs_rq, se); |
| 3132 | se->on_rq = 1; | 3188 | se->on_rq = 1; |
| @@ -3193,19 +3249,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 3193 | update_curr(cfs_rq); | 3249 | update_curr(cfs_rq); |
| 3194 | dequeue_entity_load_avg(cfs_rq, se); | 3250 | dequeue_entity_load_avg(cfs_rq, se); |
| 3195 | 3251 | ||
| 3196 | update_stats_dequeue(cfs_rq, se); | 3252 | if (schedstat_enabled()) |
| 3197 | if (flags & DEQUEUE_SLEEP) { | 3253 | update_stats_dequeue(cfs_rq, se, flags); |
| 3198 | #ifdef CONFIG_SCHEDSTATS | ||
| 3199 | if (entity_is_task(se)) { | ||
| 3200 | struct task_struct *tsk = task_of(se); | ||
| 3201 | |||
| 3202 | if (tsk->state & TASK_INTERRUPTIBLE) | ||
| 3203 | se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); | ||
| 3204 | if (tsk->state & TASK_UNINTERRUPTIBLE) | ||
| 3205 | se->statistics.block_start = rq_clock(rq_of(cfs_rq)); | ||
| 3206 | } | ||
| 3207 | #endif | ||
| 3208 | } | ||
| 3209 | 3254 | ||
| 3210 | clear_buddies(cfs_rq, se); | 3255 | clear_buddies(cfs_rq, se); |
| 3211 | 3256 | ||
| @@ -3279,7 +3324,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 3279 | * a CPU. So account for the time it spent waiting on the | 3324 | * a CPU. So account for the time it spent waiting on the |
| 3280 | * runqueue. | 3325 | * runqueue. |
| 3281 | */ | 3326 | */ |
| 3282 | update_stats_wait_end(cfs_rq, se); | 3327 | if (schedstat_enabled()) |
| 3328 | update_stats_wait_end(cfs_rq, se); | ||
| 3283 | __dequeue_entity(cfs_rq, se); | 3329 | __dequeue_entity(cfs_rq, se); |
| 3284 | update_load_avg(se, 1); | 3330 | update_load_avg(se, 1); |
| 3285 | } | 3331 | } |
| @@ -3292,7 +3338,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 3292 | * least twice that of our own weight (i.e. dont track it | 3338 | * least twice that of our own weight (i.e. dont track it |
| 3293 | * when there are only lesser-weight tasks around): | 3339 | * when there are only lesser-weight tasks around): |
| 3294 | */ | 3340 | */ |
| 3295 | if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 3341 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
| 3296 | se->statistics.slice_max = max(se->statistics.slice_max, | 3342 | se->statistics.slice_max = max(se->statistics.slice_max, |
| 3297 | se->sum_exec_runtime - se->prev_sum_exec_runtime); | 3343 | se->sum_exec_runtime - se->prev_sum_exec_runtime); |
| 3298 | } | 3344 | } |
| @@ -3375,9 +3421,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
| 3375 | /* throttle cfs_rqs exceeding runtime */ | 3421 | /* throttle cfs_rqs exceeding runtime */ |
| 3376 | check_cfs_rq_runtime(cfs_rq); | 3422 | check_cfs_rq_runtime(cfs_rq); |
| 3377 | 3423 | ||
| 3378 | check_spread(cfs_rq, prev); | 3424 | if (schedstat_enabled()) { |
| 3425 | check_spread(cfs_rq, prev); | ||
| 3426 | if (prev->on_rq) | ||
| 3427 | update_stats_wait_start(cfs_rq, prev); | ||
| 3428 | } | ||
| 3429 | |||
| 3379 | if (prev->on_rq) { | 3430 | if (prev->on_rq) { |
| 3380 | update_stats_wait_start(cfs_rq, prev); | ||
| 3381 | /* Put 'current' back into the tree. */ | 3431 | /* Put 'current' back into the tree. */ |
| 3382 | __enqueue_entity(cfs_rq, prev); | 3432 | __enqueue_entity(cfs_rq, prev); |
| 3383 | /* in !on_rq case, update occurred at dequeue */ | 3433 | /* in !on_rq case, update occurred at dequeue */ |
| @@ -4459,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | |||
| 4459 | 4509 | ||
| 4460 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 4510 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
| 4461 | 4511 | ||
| 4462 | old_load = this_rq->cpu_load[i] - tickless_load; | 4512 | old_load = this_rq->cpu_load[i]; |
| 4463 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | 4513 | old_load = decay_load_missed(old_load, pending_updates - 1, i); |
| 4464 | old_load += tickless_load; | 4514 | if (tickless_load) { |
| 4515 | old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); | ||
| 4516 | /* | ||
| 4517 | * old_load can never be a negative value because a | ||
| 4518 | * decayed tickless_load cannot be greater than the | ||
| 4519 | * original tickless_load. | ||
| 4520 | */ | ||
| 4521 | old_load += tickless_load; | ||
| 4522 | } | ||
| 4465 | new_load = this_load; | 4523 | new_load = this_load; |
| 4466 | /* | 4524 | /* |
| 4467 | * Round up the averaging division if load is increasing. This | 4525 | * Round up the averaging division if load is increasing. This |
| @@ -4484,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu) | |||
| 4484 | } | 4542 | } |
| 4485 | 4543 | ||
| 4486 | #ifdef CONFIG_NO_HZ_COMMON | 4544 | #ifdef CONFIG_NO_HZ_COMMON |
| 4545 | static void __update_cpu_load_nohz(struct rq *this_rq, | ||
| 4546 | unsigned long curr_jiffies, | ||
| 4547 | unsigned long load, | ||
| 4548 | int active) | ||
| 4549 | { | ||
| 4550 | unsigned long pending_updates; | ||
| 4551 | |||
| 4552 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
| 4553 | if (pending_updates) { | ||
| 4554 | this_rq->last_load_update_tick = curr_jiffies; | ||
| 4555 | /* | ||
| 4556 | * In the regular NOHZ case, we were idle, this means load 0. | ||
| 4557 | * In the NOHZ_FULL case, we were non-idle, we should consider | ||
| 4558 | * its weighted load. | ||
| 4559 | */ | ||
| 4560 | __update_cpu_load(this_rq, load, pending_updates, active); | ||
| 4561 | } | ||
| 4562 | } | ||
| 4563 | |||
| 4487 | /* | 4564 | /* |
| 4488 | * There is no sane way to deal with nohz on smp when using jiffies because the | 4565 | * There is no sane way to deal with nohz on smp when using jiffies because the |
| 4489 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | 4566 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading |
| @@ -4501,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu) | |||
| 4501 | * Called from nohz_idle_balance() to update the load ratings before doing the | 4578 | * Called from nohz_idle_balance() to update the load ratings before doing the |
| 4502 | * idle balance. | 4579 | * idle balance. |
| 4503 | */ | 4580 | */ |
| 4504 | static void update_idle_cpu_load(struct rq *this_rq) | 4581 | static void update_cpu_load_idle(struct rq *this_rq) |
| 4505 | { | 4582 | { |
| 4506 | unsigned long curr_jiffies = READ_ONCE(jiffies); | ||
| 4507 | unsigned long load = weighted_cpuload(cpu_of(this_rq)); | ||
| 4508 | unsigned long pending_updates; | ||
| 4509 | |||
| 4510 | /* | 4583 | /* |
| 4511 | * bail if there's load or we're actually up-to-date. | 4584 | * bail if there's load or we're actually up-to-date. |
| 4512 | */ | 4585 | */ |
| 4513 | if (load || curr_jiffies == this_rq->last_load_update_tick) | 4586 | if (weighted_cpuload(cpu_of(this_rq))) |
| 4514 | return; | 4587 | return; |
| 4515 | 4588 | ||
| 4516 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | 4589 | __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); |
| 4517 | this_rq->last_load_update_tick = curr_jiffies; | ||
| 4518 | |||
| 4519 | __update_cpu_load(this_rq, load, pending_updates, 0); | ||
| 4520 | } | 4590 | } |
| 4521 | 4591 | ||
| 4522 | /* | 4592 | /* |
| @@ -4527,22 +4597,12 @@ void update_cpu_load_nohz(int active) | |||
| 4527 | struct rq *this_rq = this_rq(); | 4597 | struct rq *this_rq = this_rq(); |
| 4528 | unsigned long curr_jiffies = READ_ONCE(jiffies); | 4598 | unsigned long curr_jiffies = READ_ONCE(jiffies); |
| 4529 | unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; | 4599 | unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; |
| 4530 | unsigned long pending_updates; | ||
| 4531 | 4600 | ||
| 4532 | if (curr_jiffies == this_rq->last_load_update_tick) | 4601 | if (curr_jiffies == this_rq->last_load_update_tick) |
| 4533 | return; | 4602 | return; |
| 4534 | 4603 | ||
| 4535 | raw_spin_lock(&this_rq->lock); | 4604 | raw_spin_lock(&this_rq->lock); |
| 4536 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | 4605 | __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); |
| 4537 | if (pending_updates) { | ||
| 4538 | this_rq->last_load_update_tick = curr_jiffies; | ||
| 4539 | /* | ||
| 4540 | * In the regular NOHZ case, we were idle, this means load 0. | ||
| 4541 | * In the NOHZ_FULL case, we were non-idle, we should consider | ||
| 4542 | * its weighted load. | ||
| 4543 | */ | ||
| 4544 | __update_cpu_load(this_rq, load, pending_updates, active); | ||
| 4545 | } | ||
| 4546 | raw_spin_unlock(&this_rq->lock); | 4606 | raw_spin_unlock(&this_rq->lock); |
| 4547 | } | 4607 | } |
| 4548 | #endif /* CONFIG_NO_HZ */ | 4608 | #endif /* CONFIG_NO_HZ */ |
| @@ -4554,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq) | |||
| 4554 | { | 4614 | { |
| 4555 | unsigned long load = weighted_cpuload(cpu_of(this_rq)); | 4615 | unsigned long load = weighted_cpuload(cpu_of(this_rq)); |
| 4556 | /* | 4616 | /* |
| 4557 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | 4617 | * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). |
| 4558 | */ | 4618 | */ |
| 4559 | this_rq->last_load_update_tick = jiffies; | 4619 | this_rq->last_load_update_tick = jiffies; |
| 4560 | __update_cpu_load(this_rq, load, 1, 1); | 4620 | __update_cpu_load(this_rq, load, 1, 1); |
| @@ -7848,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
| 7848 | if (time_after_eq(jiffies, rq->next_balance)) { | 7908 | if (time_after_eq(jiffies, rq->next_balance)) { |
| 7849 | raw_spin_lock_irq(&rq->lock); | 7909 | raw_spin_lock_irq(&rq->lock); |
| 7850 | update_rq_clock(rq); | 7910 | update_rq_clock(rq); |
| 7851 | update_idle_cpu_load(rq); | 7911 | update_cpu_load_idle(rq); |
| 7852 | raw_spin_unlock_irq(&rq->lock); | 7912 | raw_spin_unlock_irq(&rq->lock); |
| 7853 | rebalance_domains(rq, CPU_IDLE); | 7913 | rebalance_domains(rq, CPU_IDLE); |
| 7854 | } | 7914 | } |
| @@ -8234,11 +8294,8 @@ void free_fair_sched_group(struct task_group *tg) | |||
| 8234 | for_each_possible_cpu(i) { | 8294 | for_each_possible_cpu(i) { |
| 8235 | if (tg->cfs_rq) | 8295 | if (tg->cfs_rq) |
| 8236 | kfree(tg->cfs_rq[i]); | 8296 | kfree(tg->cfs_rq[i]); |
| 8237 | if (tg->se) { | 8297 | if (tg->se) |
| 8238 | if (tg->se[i]) | ||
| 8239 | remove_entity_load_avg(tg->se[i]); | ||
| 8240 | kfree(tg->se[i]); | 8298 | kfree(tg->se[i]); |
| 8241 | } | ||
| 8242 | } | 8299 | } |
| 8243 | 8300 | ||
| 8244 | kfree(tg->cfs_rq); | 8301 | kfree(tg->cfs_rq); |
| @@ -8286,21 +8343,29 @@ err: | |||
| 8286 | return 0; | 8343 | return 0; |
| 8287 | } | 8344 | } |
| 8288 | 8345 | ||
| 8289 | void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8346 | void unregister_fair_sched_group(struct task_group *tg) |
| 8290 | { | 8347 | { |
| 8291 | struct rq *rq = cpu_rq(cpu); | ||
| 8292 | unsigned long flags; | 8348 | unsigned long flags; |
| 8349 | struct rq *rq; | ||
| 8350 | int cpu; | ||
| 8293 | 8351 | ||
| 8294 | /* | 8352 | for_each_possible_cpu(cpu) { |
| 8295 | * Only empty task groups can be destroyed; so we can speculatively | 8353 | if (tg->se[cpu]) |
| 8296 | * check on_list without danger of it being re-added. | 8354 | remove_entity_load_avg(tg->se[cpu]); |
| 8297 | */ | ||
| 8298 | if (!tg->cfs_rq[cpu]->on_list) | ||
| 8299 | return; | ||
| 8300 | 8355 | ||
| 8301 | raw_spin_lock_irqsave(&rq->lock, flags); | 8356 | /* |
| 8302 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | 8357 | * Only empty task groups can be destroyed; so we can speculatively |
| 8303 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 8358 | * check on_list without danger of it being re-added. |
| 8359 | */ | ||
| 8360 | if (!tg->cfs_rq[cpu]->on_list) | ||
| 8361 | continue; | ||
| 8362 | |||
| 8363 | rq = cpu_rq(cpu); | ||
| 8364 | |||
| 8365 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 8366 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
| 8367 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 8368 | } | ||
| 8304 | } | 8369 | } |
| 8305 | 8370 | ||
| 8306 | void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 8371 | void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
| @@ -8382,7 +8447,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8382 | return 1; | 8447 | return 1; |
| 8383 | } | 8448 | } |
| 8384 | 8449 | ||
| 8385 | void unregister_fair_sched_group(struct task_group *tg, int cpu) { } | 8450 | void unregister_fair_sched_group(struct task_group *tg) { } |
| 8386 | 8451 | ||
| 8387 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8452 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 8388 | 8453 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..a774b4dbf291 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
| 58 | raw_spin_lock(&rt_b->rt_runtime_lock); | 58 | raw_spin_lock(&rt_b->rt_runtime_lock); |
| 59 | if (!rt_b->rt_period_active) { | 59 | if (!rt_b->rt_period_active) { |
| 60 | rt_b->rt_period_active = 1; | 60 | rt_b->rt_period_active = 1; |
| 61 | hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); | 61 | /* |
| 62 | * SCHED_DEADLINE updates the bandwidth, as a run away | ||
| 63 | * RT task with a DL task could hog a CPU. But DL does | ||
| 64 | * not reset the period. If a deadline task was running | ||
| 65 | * without an RT task running, it can cause RT tasks to | ||
| 66 | * throttle when they start up. Kick the timer right away | ||
| 67 | * to update the period. | ||
| 68 | */ | ||
| 69 | hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); | ||
| 62 | hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); | 70 | hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); |
| 63 | } | 71 | } |
| 64 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 72 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
| @@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); | |||
| 436 | 444 | ||
| 437 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) | 445 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) |
| 438 | { | 446 | { |
| 439 | return !list_empty(&rt_se->run_list); | 447 | return rt_se->on_rq; |
| 440 | } | 448 | } |
| 441 | 449 | ||
| 442 | #ifdef CONFIG_RT_GROUP_SCHED | 450 | #ifdef CONFIG_RT_GROUP_SCHED |
| @@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | |||
| 482 | return rt_se->my_q; | 490 | return rt_se->my_q; |
| 483 | } | 491 | } |
| 484 | 492 | ||
| 485 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); | 493 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); |
| 486 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | 494 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); |
| 487 | 495 | ||
| 488 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 496 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
| 489 | { | 497 | { |
| @@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
| 499 | if (!rt_se) | 507 | if (!rt_se) |
| 500 | enqueue_top_rt_rq(rt_rq); | 508 | enqueue_top_rt_rq(rt_rq); |
| 501 | else if (!on_rt_rq(rt_se)) | 509 | else if (!on_rt_rq(rt_se)) |
| 502 | enqueue_rt_entity(rt_se, false); | 510 | enqueue_rt_entity(rt_se, 0); |
| 503 | 511 | ||
| 504 | if (rt_rq->highest_prio.curr < curr->prio) | 512 | if (rt_rq->highest_prio.curr < curr->prio) |
| 505 | resched_curr(rq); | 513 | resched_curr(rq); |
| @@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | |||
| 516 | if (!rt_se) | 524 | if (!rt_se) |
| 517 | dequeue_top_rt_rq(rt_rq); | 525 | dequeue_top_rt_rq(rt_rq); |
| 518 | else if (on_rt_rq(rt_se)) | 526 | else if (on_rt_rq(rt_se)) |
| 519 | dequeue_rt_entity(rt_se); | 527 | dequeue_rt_entity(rt_se, 0); |
| 520 | } | 528 | } |
| 521 | 529 | ||
| 522 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | 530 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) |
| @@ -1166,7 +1174,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 1166 | dec_rt_group(rt_se, rt_rq); | 1174 | dec_rt_group(rt_se, rt_rq); |
| 1167 | } | 1175 | } |
| 1168 | 1176 | ||
| 1169 | static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | 1177 | /* |
| 1178 | * Change rt_se->run_list location unless SAVE && !MOVE | ||
| 1179 | * | ||
| 1180 | * assumes ENQUEUE/DEQUEUE flags match | ||
| 1181 | */ | ||
| 1182 | static inline bool move_entity(unsigned int flags) | ||
| 1183 | { | ||
| 1184 | if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) | ||
| 1185 | return false; | ||
| 1186 | |||
| 1187 | return true; | ||
| 1188 | } | ||
| 1189 | |||
| 1190 | static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) | ||
| 1191 | { | ||
| 1192 | list_del_init(&rt_se->run_list); | ||
| 1193 | |||
| 1194 | if (list_empty(array->queue + rt_se_prio(rt_se))) | ||
| 1195 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | ||
| 1196 | |||
| 1197 | rt_se->on_list = 0; | ||
| 1198 | } | ||
| 1199 | |||
| 1200 | static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) | ||
| 1170 | { | 1201 | { |
| 1171 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 1202 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
| 1172 | struct rt_prio_array *array = &rt_rq->active; | 1203 | struct rt_prio_array *array = &rt_rq->active; |
| @@ -1179,26 +1210,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | |||
| 1179 | * get throttled and the current group doesn't have any other | 1210 | * get throttled and the current group doesn't have any other |
| 1180 | * active members. | 1211 | * active members. |
| 1181 | */ | 1212 | */ |
| 1182 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) | 1213 | if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { |
| 1214 | if (rt_se->on_list) | ||
| 1215 | __delist_rt_entity(rt_se, array); | ||
| 1183 | return; | 1216 | return; |
| 1217 | } | ||
| 1184 | 1218 | ||
| 1185 | if (head) | 1219 | if (move_entity(flags)) { |
| 1186 | list_add(&rt_se->run_list, queue); | 1220 | WARN_ON_ONCE(rt_se->on_list); |
| 1187 | else | 1221 | if (flags & ENQUEUE_HEAD) |
| 1188 | list_add_tail(&rt_se->run_list, queue); | 1222 | list_add(&rt_se->run_list, queue); |
| 1189 | __set_bit(rt_se_prio(rt_se), array->bitmap); | 1223 | else |
| 1224 | list_add_tail(&rt_se->run_list, queue); | ||
| 1225 | |||
| 1226 | __set_bit(rt_se_prio(rt_se), array->bitmap); | ||
| 1227 | rt_se->on_list = 1; | ||
| 1228 | } | ||
| 1229 | rt_se->on_rq = 1; | ||
| 1190 | 1230 | ||
| 1191 | inc_rt_tasks(rt_se, rt_rq); | 1231 | inc_rt_tasks(rt_se, rt_rq); |
| 1192 | } | 1232 | } |
| 1193 | 1233 | ||
| 1194 | static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) | 1234 | static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) |
| 1195 | { | 1235 | { |
| 1196 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | 1236 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
| 1197 | struct rt_prio_array *array = &rt_rq->active; | 1237 | struct rt_prio_array *array = &rt_rq->active; |
| 1198 | 1238 | ||
| 1199 | list_del_init(&rt_se->run_list); | 1239 | if (move_entity(flags)) { |
| 1200 | if (list_empty(array->queue + rt_se_prio(rt_se))) | 1240 | WARN_ON_ONCE(!rt_se->on_list); |
| 1201 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | 1241 | __delist_rt_entity(rt_se, array); |
| 1242 | } | ||
| 1243 | rt_se->on_rq = 0; | ||
| 1202 | 1244 | ||
| 1203 | dec_rt_tasks(rt_se, rt_rq); | 1245 | dec_rt_tasks(rt_se, rt_rq); |
| 1204 | } | 1246 | } |
| @@ -1207,7 +1249,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
| 1207 | * Because the prio of an upper entry depends on the lower | 1249 | * Because the prio of an upper entry depends on the lower |
| 1208 | * entries, we must remove entries top - down. | 1250 | * entries, we must remove entries top - down. |
| 1209 | */ | 1251 | */ |
| 1210 | static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | 1252 | static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) |
| 1211 | { | 1253 | { |
| 1212 | struct sched_rt_entity *back = NULL; | 1254 | struct sched_rt_entity *back = NULL; |
| 1213 | 1255 | ||
| @@ -1220,31 +1262,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | |||
| 1220 | 1262 | ||
| 1221 | for (rt_se = back; rt_se; rt_se = rt_se->back) { | 1263 | for (rt_se = back; rt_se; rt_se = rt_se->back) { |
| 1222 | if (on_rt_rq(rt_se)) | 1264 | if (on_rt_rq(rt_se)) |
| 1223 | __dequeue_rt_entity(rt_se); | 1265 | __dequeue_rt_entity(rt_se, flags); |
| 1224 | } | 1266 | } |
| 1225 | } | 1267 | } |
| 1226 | 1268 | ||
| 1227 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | 1269 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) |
| 1228 | { | 1270 | { |
| 1229 | struct rq *rq = rq_of_rt_se(rt_se); | 1271 | struct rq *rq = rq_of_rt_se(rt_se); |
| 1230 | 1272 | ||
| 1231 | dequeue_rt_stack(rt_se); | 1273 | dequeue_rt_stack(rt_se, flags); |
| 1232 | for_each_sched_rt_entity(rt_se) | 1274 | for_each_sched_rt_entity(rt_se) |
| 1233 | __enqueue_rt_entity(rt_se, head); | 1275 | __enqueue_rt_entity(rt_se, flags); |
| 1234 | enqueue_top_rt_rq(&rq->rt); | 1276 | enqueue_top_rt_rq(&rq->rt); |
| 1235 | } | 1277 | } |
| 1236 | 1278 | ||
| 1237 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | 1279 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) |
| 1238 | { | 1280 | { |
| 1239 | struct rq *rq = rq_of_rt_se(rt_se); | 1281 | struct rq *rq = rq_of_rt_se(rt_se); |
| 1240 | 1282 | ||
| 1241 | dequeue_rt_stack(rt_se); | 1283 | dequeue_rt_stack(rt_se, flags); |
| 1242 | 1284 | ||
| 1243 | for_each_sched_rt_entity(rt_se) { | 1285 | for_each_sched_rt_entity(rt_se) { |
| 1244 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | 1286 | struct rt_rq *rt_rq = group_rt_rq(rt_se); |
| 1245 | 1287 | ||
| 1246 | if (rt_rq && rt_rq->rt_nr_running) | 1288 | if (rt_rq && rt_rq->rt_nr_running) |
| 1247 | __enqueue_rt_entity(rt_se, false); | 1289 | __enqueue_rt_entity(rt_se, flags); |
| 1248 | } | 1290 | } |
| 1249 | enqueue_top_rt_rq(&rq->rt); | 1291 | enqueue_top_rt_rq(&rq->rt); |
| 1250 | } | 1292 | } |
| @@ -1260,7 +1302,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
| 1260 | if (flags & ENQUEUE_WAKEUP) | 1302 | if (flags & ENQUEUE_WAKEUP) |
| 1261 | rt_se->timeout = 0; | 1303 | rt_se->timeout = 0; |
| 1262 | 1304 | ||
| 1263 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); | 1305 | enqueue_rt_entity(rt_se, flags); |
| 1264 | 1306 | ||
| 1265 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) | 1307 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
| 1266 | enqueue_pushable_task(rq, p); | 1308 | enqueue_pushable_task(rq, p); |
| @@ -1271,7 +1313,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
| 1271 | struct sched_rt_entity *rt_se = &p->rt; | 1313 | struct sched_rt_entity *rt_se = &p->rt; |
| 1272 | 1314 | ||
| 1273 | update_curr_rt(rq); | 1315 | update_curr_rt(rq); |
| 1274 | dequeue_rt_entity(rt_se); | 1316 | dequeue_rt_entity(rt_se, flags); |
| 1275 | 1317 | ||
| 1276 | dequeue_pushable_task(rq, p); | 1318 | dequeue_pushable_task(rq, p); |
| 1277 | } | 1319 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..ef5875fff5b7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -3,6 +3,7 @@ | |||
| 3 | #include <linux/sched/sysctl.h> | 3 | #include <linux/sched/sysctl.h> |
| 4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
| 5 | #include <linux/sched/deadline.h> | 5 | #include <linux/sched/deadline.h> |
| 6 | #include <linux/binfmts.h> | ||
| 6 | #include <linux/mutex.h> | 7 | #include <linux/mutex.h> |
| 7 | #include <linux/spinlock.h> | 8 | #include <linux/spinlock.h> |
| 8 | #include <linux/stop_machine.h> | 9 | #include <linux/stop_machine.h> |
| @@ -313,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data); | |||
| 313 | 314 | ||
| 314 | extern void free_fair_sched_group(struct task_group *tg); | 315 | extern void free_fair_sched_group(struct task_group *tg); |
| 315 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); | 316 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); |
| 316 | extern void unregister_fair_sched_group(struct task_group *tg, int cpu); | 317 | extern void unregister_fair_sched_group(struct task_group *tg); |
| 317 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 318 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
| 318 | struct sched_entity *se, int cpu, | 319 | struct sched_entity *se, int cpu, |
| 319 | struct sched_entity *parent); | 320 | struct sched_entity *parent); |
| 320 | extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | 321 | extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); |
| 321 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | ||
| 322 | 322 | ||
| 323 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); | 323 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); |
| 324 | extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | 324 | extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); |
| @@ -909,6 +909,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group) | |||
| 909 | 909 | ||
| 910 | extern int group_balance_cpu(struct sched_group *sg); | 910 | extern int group_balance_cpu(struct sched_group *sg); |
| 911 | 911 | ||
| 912 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | ||
| 913 | void register_sched_domain_sysctl(void); | ||
| 914 | void unregister_sched_domain_sysctl(void); | ||
| 915 | #else | ||
| 916 | static inline void register_sched_domain_sysctl(void) | ||
| 917 | { | ||
| 918 | } | ||
| 919 | static inline void unregister_sched_domain_sysctl(void) | ||
| 920 | { | ||
| 921 | } | ||
| 922 | #endif | ||
| 923 | |||
| 912 | #else | 924 | #else |
| 913 | 925 | ||
| 914 | static inline void sched_ttwu_pending(void) { } | 926 | static inline void sched_ttwu_pending(void) { } |
| @@ -1022,6 +1034,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | |||
| 1022 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 1034 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
| 1023 | 1035 | ||
| 1024 | extern struct static_key_false sched_numa_balancing; | 1036 | extern struct static_key_false sched_numa_balancing; |
| 1037 | extern struct static_key_false sched_schedstats; | ||
| 1025 | 1038 | ||
| 1026 | static inline u64 global_rt_period(void) | 1039 | static inline u64 global_rt_period(void) |
| 1027 | { | 1040 | { |
| @@ -1130,18 +1143,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 1130 | extern const int sched_prio_to_weight[40]; | 1143 | extern const int sched_prio_to_weight[40]; |
| 1131 | extern const u32 sched_prio_to_wmult[40]; | 1144 | extern const u32 sched_prio_to_wmult[40]; |
| 1132 | 1145 | ||
| 1146 | /* | ||
| 1147 | * {de,en}queue flags: | ||
| 1148 | * | ||
| 1149 | * DEQUEUE_SLEEP - task is no longer runnable | ||
| 1150 | * ENQUEUE_WAKEUP - task just became runnable | ||
| 1151 | * | ||
| 1152 | * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks | ||
| 1153 | * are in a known state which allows modification. Such pairs | ||
| 1154 | * should preserve as much state as possible. | ||
| 1155 | * | ||
| 1156 | * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location | ||
| 1157 | * in the runqueue. | ||
| 1158 | * | ||
| 1159 | * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) | ||
| 1160 | * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) | ||
| 1161 | * ENQUEUE_WAKING - sched_class::task_waking was called | ||
| 1162 | * | ||
| 1163 | */ | ||
| 1164 | |||
| 1165 | #define DEQUEUE_SLEEP 0x01 | ||
| 1166 | #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ | ||
| 1167 | #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ | ||
| 1168 | |||
| 1133 | #define ENQUEUE_WAKEUP 0x01 | 1169 | #define ENQUEUE_WAKEUP 0x01 |
| 1134 | #define ENQUEUE_HEAD 0x02 | 1170 | #define ENQUEUE_RESTORE 0x02 |
| 1171 | #define ENQUEUE_MOVE 0x04 | ||
| 1172 | |||
| 1173 | #define ENQUEUE_HEAD 0x08 | ||
| 1174 | #define ENQUEUE_REPLENISH 0x10 | ||
| 1135 | #ifdef CONFIG_SMP | 1175 | #ifdef CONFIG_SMP |
| 1136 | #define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ | 1176 | #define ENQUEUE_WAKING 0x20 |
| 1137 | #else | 1177 | #else |
| 1138 | #define ENQUEUE_WAKING 0x00 | 1178 | #define ENQUEUE_WAKING 0x00 |
| 1139 | #endif | 1179 | #endif |
| 1140 | #define ENQUEUE_REPLENISH 0x08 | ||
| 1141 | #define ENQUEUE_RESTORE 0x10 | ||
| 1142 | |||
| 1143 | #define DEQUEUE_SLEEP 0x01 | ||
| 1144 | #define DEQUEUE_SAVE 0x02 | ||
| 1145 | 1180 | ||
| 1146 | #define RETRY_TASK ((void *)-1UL) | 1181 | #define RETRY_TASK ((void *)-1UL) |
| 1147 | 1182 | ||
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc7632de5..70b3b6a20fb0 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
| @@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
| 29 | if (rq) | 29 | if (rq) |
| 30 | rq->rq_sched_info.run_delay += delta; | 30 | rq->rq_sched_info.run_delay += delta; |
| 31 | } | 31 | } |
| 32 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 32 | # define schedstat_enabled() static_branch_unlikely(&sched_schedstats) |
| 33 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 33 | # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) |
| 34 | # define schedstat_set(var, val) do { var = (val); } while (0) | 34 | # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) |
| 35 | # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | ||
| 35 | #else /* !CONFIG_SCHEDSTATS */ | 36 | #else /* !CONFIG_SCHEDSTATS */ |
| 36 | static inline void | 37 | static inline void |
| 37 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 38 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) |
| @@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
| 42 | static inline void | 43 | static inline void |
| 43 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 44 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) |
| 44 | {} | 45 | {} |
| 46 | # define schedstat_enabled() 0 | ||
| 45 | # define schedstat_inc(rq, field) do { } while (0) | 47 | # define schedstat_inc(rq, field) do { } while (0) |
| 46 | # define schedstat_add(rq, field, amt) do { } while (0) | 48 | # define schedstat_add(rq, field, amt) do { } while (0) |
| 47 | # define schedstat_set(var, val) do { } while (0) | 49 | # define schedstat_set(var, val) do { } while (0) |
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c new file mode 100644 index 000000000000..82f0dff90030 --- /dev/null +++ b/kernel/sched/swait.c | |||
| @@ -0,0 +1,123 @@ | |||
| 1 | #include <linux/sched.h> | ||
| 2 | #include <linux/swait.h> | ||
| 3 | |||
| 4 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, | ||
| 5 | struct lock_class_key *key) | ||
| 6 | { | ||
| 7 | raw_spin_lock_init(&q->lock); | ||
| 8 | lockdep_set_class_and_name(&q->lock, key, name); | ||
| 9 | INIT_LIST_HEAD(&q->task_list); | ||
| 10 | } | ||
| 11 | EXPORT_SYMBOL(__init_swait_queue_head); | ||
| 12 | |||
| 13 | /* | ||
| 14 | * The thing about the wake_up_state() return value; I think we can ignore it. | ||
| 15 | * | ||
| 16 | * If for some reason it would return 0, that means the previously waiting | ||
| 17 | * task is already running, so it will observe condition true (or has already). | ||
| 18 | */ | ||
| 19 | void swake_up_locked(struct swait_queue_head *q) | ||
| 20 | { | ||
| 21 | struct swait_queue *curr; | ||
| 22 | |||
| 23 | if (list_empty(&q->task_list)) | ||
| 24 | return; | ||
| 25 | |||
| 26 | curr = list_first_entry(&q->task_list, typeof(*curr), task_list); | ||
| 27 | wake_up_process(curr->task); | ||
| 28 | list_del_init(&curr->task_list); | ||
| 29 | } | ||
| 30 | EXPORT_SYMBOL(swake_up_locked); | ||
| 31 | |||
| 32 | void swake_up(struct swait_queue_head *q) | ||
| 33 | { | ||
| 34 | unsigned long flags; | ||
| 35 | |||
| 36 | if (!swait_active(q)) | ||
| 37 | return; | ||
| 38 | |||
| 39 | raw_spin_lock_irqsave(&q->lock, flags); | ||
| 40 | swake_up_locked(q); | ||
| 41 | raw_spin_unlock_irqrestore(&q->lock, flags); | ||
| 42 | } | ||
| 43 | EXPORT_SYMBOL(swake_up); | ||
| 44 | |||
| 45 | /* | ||
| 46 | * Does not allow usage from IRQ disabled, since we must be able to | ||
| 47 | * release IRQs to guarantee bounded hold time. | ||
| 48 | */ | ||
| 49 | void swake_up_all(struct swait_queue_head *q) | ||
| 50 | { | ||
| 51 | struct swait_queue *curr; | ||
| 52 | LIST_HEAD(tmp); | ||
| 53 | |||
| 54 | if (!swait_active(q)) | ||
| 55 | return; | ||
| 56 | |||
| 57 | raw_spin_lock_irq(&q->lock); | ||
| 58 | list_splice_init(&q->task_list, &tmp); | ||
| 59 | while (!list_empty(&tmp)) { | ||
| 60 | curr = list_first_entry(&tmp, typeof(*curr), task_list); | ||
| 61 | |||
| 62 | wake_up_state(curr->task, TASK_NORMAL); | ||
| 63 | list_del_init(&curr->task_list); | ||
| 64 | |||
| 65 | if (list_empty(&tmp)) | ||
| 66 | break; | ||
| 67 | |||
| 68 | raw_spin_unlock_irq(&q->lock); | ||
| 69 | raw_spin_lock_irq(&q->lock); | ||
| 70 | } | ||
| 71 | raw_spin_unlock_irq(&q->lock); | ||
| 72 | } | ||
| 73 | EXPORT_SYMBOL(swake_up_all); | ||
| 74 | |||
| 75 | void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) | ||
| 76 | { | ||
| 77 | wait->task = current; | ||
| 78 | if (list_empty(&wait->task_list)) | ||
| 79 | list_add(&wait->task_list, &q->task_list); | ||
| 80 | } | ||
| 81 | |||
| 82 | void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) | ||
| 83 | { | ||
| 84 | unsigned long flags; | ||
| 85 | |||
| 86 | raw_spin_lock_irqsave(&q->lock, flags); | ||
| 87 | __prepare_to_swait(q, wait); | ||
| 88 | set_current_state(state); | ||
| 89 | raw_spin_unlock_irqrestore(&q->lock, flags); | ||
| 90 | } | ||
| 91 | EXPORT_SYMBOL(prepare_to_swait); | ||
| 92 | |||
| 93 | long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) | ||
| 94 | { | ||
| 95 | if (signal_pending_state(state, current)) | ||
| 96 | return -ERESTARTSYS; | ||
| 97 | |||
| 98 | prepare_to_swait(q, wait, state); | ||
| 99 | |||
| 100 | return 0; | ||
| 101 | } | ||
| 102 | EXPORT_SYMBOL(prepare_to_swait_event); | ||
| 103 | |||
| 104 | void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) | ||
| 105 | { | ||
| 106 | __set_current_state(TASK_RUNNING); | ||
| 107 | if (!list_empty(&wait->task_list)) | ||
| 108 | list_del_init(&wait->task_list); | ||
| 109 | } | ||
| 110 | |||
| 111 | void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) | ||
| 112 | { | ||
| 113 | unsigned long flags; | ||
| 114 | |||
| 115 | __set_current_state(TASK_RUNNING); | ||
| 116 | |||
| 117 | if (!list_empty_careful(&wait->task_list)) { | ||
| 118 | raw_spin_lock_irqsave(&q->lock, flags); | ||
| 119 | list_del_init(&wait->task_list); | ||
| 120 | raw_spin_unlock_irqrestore(&q->lock, flags); | ||
| 121 | } | ||
| 122 | } | ||
| 123 | EXPORT_SYMBOL(finish_swait); | ||
diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e4436f787..8aae49dd7da8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) | |||
| 116 | 116 | ||
| 117 | if (preempt_count() == cnt) { | 117 | if (preempt_count() == cnt) { |
| 118 | #ifdef CONFIG_DEBUG_PREEMPT | 118 | #ifdef CONFIG_DEBUG_PREEMPT |
| 119 | current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); | 119 | current->preempt_disable_ip = get_lock_parent_ip(); |
| 120 | #endif | 120 | #endif |
| 121 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 121 | trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); |
| 122 | } | 122 | } |
| 123 | } | 123 | } |
| 124 | EXPORT_SYMBOL(__local_bh_disable_ip); | 124 | EXPORT_SYMBOL(__local_bh_disable_ip); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97715fd9e790..f5102fabef7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = { | |||
| 350 | .mode = 0644, | 350 | .mode = 0644, |
| 351 | .proc_handler = proc_dointvec, | 351 | .proc_handler = proc_dointvec, |
| 352 | }, | 352 | }, |
| 353 | #ifdef CONFIG_SCHEDSTATS | ||
| 354 | { | ||
| 355 | .procname = "sched_schedstats", | ||
| 356 | .data = NULL, | ||
| 357 | .maxlen = sizeof(unsigned int), | ||
| 358 | .mode = 0644, | ||
| 359 | .proc_handler = sysctl_schedstats, | ||
| 360 | .extra1 = &zero, | ||
| 361 | .extra2 = &one, | ||
| 362 | }, | ||
| 363 | #endif /* CONFIG_SCHEDSTATS */ | ||
| 353 | #endif /* CONFIG_SMP */ | 364 | #endif /* CONFIG_SMP */ |
| 354 | #ifdef CONFIG_NUMA_BALANCING | 365 | #ifdef CONFIG_NUMA_BALANCING |
| 355 | { | 366 | { |
| @@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = { | |||
| 505 | .data = &latencytop_enabled, | 516 | .data = &latencytop_enabled, |
| 506 | .maxlen = sizeof(int), | 517 | .maxlen = sizeof(int), |
| 507 | .mode = 0644, | 518 | .mode = 0644, |
| 508 | .proc_handler = proc_dointvec, | 519 | .proc_handler = sysctl_latencytop, |
| 509 | }, | 520 | }, |
| 510 | #endif | 521 | #endif |
| 511 | #ifdef CONFIG_BLK_DEV_INITRD | 522 | #ifdef CONFIG_BLK_DEV_INITRD |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49e32bf..f8e26ab963ed 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
| @@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
| 93 | { | 93 | { |
| 94 | struct mm_struct *mm; | 94 | struct mm_struct *mm; |
| 95 | 95 | ||
| 96 | /* convert pages-usec to Mbyte-usec */ | 96 | /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ |
| 97 | stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; | 97 | stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; |
| 98 | stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; | 98 | do_div(stats->coremem, 1000 * KB); |
| 99 | stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; | ||
| 100 | do_div(stats->virtmem, 1000 * KB); | ||
| 99 | mm = get_task_mm(p); | 101 | mm = get_task_mm(p); |
| 100 | if (mm) { | 102 | if (mm) { |
| 101 | /* adjust to KB unit */ | 103 | /* adjust to KB unit */ |
| @@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
| 123 | static void __acct_update_integrals(struct task_struct *tsk, | 125 | static void __acct_update_integrals(struct task_struct *tsk, |
| 124 | cputime_t utime, cputime_t stime) | 126 | cputime_t utime, cputime_t stime) |
| 125 | { | 127 | { |
| 126 | if (likely(tsk->mm)) { | 128 | cputime_t time, dtime; |
| 127 | cputime_t time, dtime; | 129 | u64 delta; |
| 128 | struct timeval value; | 130 | |
| 129 | unsigned long flags; | 131 | if (!likely(tsk->mm)) |
| 130 | u64 delta; | 132 | return; |
| 131 | 133 | ||
| 132 | local_irq_save(flags); | 134 | time = stime + utime; |
| 133 | time = stime + utime; | 135 | dtime = time - tsk->acct_timexpd; |
| 134 | dtime = time - tsk->acct_timexpd; | 136 | /* Avoid division: cputime_t is often in nanoseconds already. */ |
| 135 | jiffies_to_timeval(cputime_to_jiffies(dtime), &value); | 137 | delta = cputime_to_nsecs(dtime); |
| 136 | delta = value.tv_sec; | 138 | |
| 137 | delta = delta * USEC_PER_SEC + value.tv_usec; | 139 | if (delta < TICK_NSEC) |
| 138 | 140 | return; | |
| 139 | if (delta == 0) | 141 | |
| 140 | goto out; | 142 | tsk->acct_timexpd = time; |
| 141 | tsk->acct_timexpd = time; | 143 | /* |
| 142 | tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); | 144 | * Divide by 1024 to avoid overflow, and to avoid division. |
| 143 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; | 145 | * The final unit reported to userspace is Mbyte-usecs, |
| 144 | out: | 146 | * the rest of the math is done in xacct_add_tsk. |
| 145 | local_irq_restore(flags); | 147 | */ |
| 146 | } | 148 | tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; |
| 149 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; | ||
| 147 | } | 150 | } |
| 148 | 151 | ||
| 149 | /** | 152 | /** |
| @@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk, | |||
| 153 | void acct_update_integrals(struct task_struct *tsk) | 156 | void acct_update_integrals(struct task_struct *tsk) |
| 154 | { | 157 | { |
| 155 | cputime_t utime, stime; | 158 | cputime_t utime, stime; |
| 159 | unsigned long flags; | ||
| 156 | 160 | ||
| 161 | local_irq_save(flags); | ||
| 157 | task_cputime(tsk, &utime, &stime); | 162 | task_cputime(tsk, &utime, &stime); |
| 158 | __acct_update_integrals(tsk, utime, stime); | 163 | __acct_update_integrals(tsk, utime, stime); |
| 164 | local_irq_restore(flags); | ||
| 159 | } | 165 | } |
| 160 | 166 | ||
| 161 | /** | 167 | /** |
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index db2dd3335c6a..65da997b430a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c | |||
| @@ -97,8 +97,8 @@ static void async_pf_execute(struct work_struct *work) | |||
| 97 | * This memory barrier pairs with prepare_to_wait's set_current_state() | 97 | * This memory barrier pairs with prepare_to_wait's set_current_state() |
| 98 | */ | 98 | */ |
| 99 | smp_mb(); | 99 | smp_mb(); |
| 100 | if (waitqueue_active(&vcpu->wq)) | 100 | if (swait_active(&vcpu->wq)) |
| 101 | wake_up_interruptible(&vcpu->wq); | 101 | swake_up(&vcpu->wq); |
| 102 | 102 | ||
| 103 | mmput(mm); | 103 | mmput(mm); |
| 104 | kvm_put_kvm(vcpu->kvm); | 104 | kvm_put_kvm(vcpu->kvm); |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9102ae172d2a..5af50c3ddd53 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
| @@ -216,8 +216,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | |||
| 216 | vcpu->kvm = kvm; | 216 | vcpu->kvm = kvm; |
| 217 | vcpu->vcpu_id = id; | 217 | vcpu->vcpu_id = id; |
| 218 | vcpu->pid = NULL; | 218 | vcpu->pid = NULL; |
| 219 | vcpu->halt_poll_ns = 0; | 219 | init_swait_queue_head(&vcpu->wq); |
| 220 | init_waitqueue_head(&vcpu->wq); | ||
| 221 | kvm_async_pf_vcpu_init(vcpu); | 220 | kvm_async_pf_vcpu_init(vcpu); |
| 222 | 221 | ||
| 223 | vcpu->pre_pcpu = -1; | 222 | vcpu->pre_pcpu = -1; |
| @@ -1993,7 +1992,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) | |||
| 1993 | void kvm_vcpu_block(struct kvm_vcpu *vcpu) | 1992 | void kvm_vcpu_block(struct kvm_vcpu *vcpu) |
| 1994 | { | 1993 | { |
| 1995 | ktime_t start, cur; | 1994 | ktime_t start, cur; |
| 1996 | DEFINE_WAIT(wait); | 1995 | DECLARE_SWAITQUEUE(wait); |
| 1997 | bool waited = false; | 1996 | bool waited = false; |
| 1998 | u64 block_ns; | 1997 | u64 block_ns; |
| 1999 | 1998 | ||
| @@ -2018,7 +2017,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) | |||
| 2018 | kvm_arch_vcpu_blocking(vcpu); | 2017 | kvm_arch_vcpu_blocking(vcpu); |
| 2019 | 2018 | ||
| 2020 | for (;;) { | 2019 | for (;;) { |
| 2021 | prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); | 2020 | prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); |
| 2022 | 2021 | ||
| 2023 | if (kvm_vcpu_check_block(vcpu) < 0) | 2022 | if (kvm_vcpu_check_block(vcpu) < 0) |
| 2024 | break; | 2023 | break; |
| @@ -2027,7 +2026,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) | |||
| 2027 | schedule(); | 2026 | schedule(); |
| 2028 | } | 2027 | } |
| 2029 | 2028 | ||
| 2030 | finish_wait(&vcpu->wq, &wait); | 2029 | finish_swait(&vcpu->wq, &wait); |
| 2031 | cur = ktime_get(); | 2030 | cur = ktime_get(); |
| 2032 | 2031 | ||
| 2033 | kvm_arch_vcpu_unblocking(vcpu); | 2032 | kvm_arch_vcpu_unblocking(vcpu); |
| @@ -2059,11 +2058,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | |||
| 2059 | { | 2058 | { |
| 2060 | int me; | 2059 | int me; |
| 2061 | int cpu = vcpu->cpu; | 2060 | int cpu = vcpu->cpu; |
| 2062 | wait_queue_head_t *wqp; | 2061 | struct swait_queue_head *wqp; |
| 2063 | 2062 | ||
| 2064 | wqp = kvm_arch_vcpu_wq(vcpu); | 2063 | wqp = kvm_arch_vcpu_wq(vcpu); |
| 2065 | if (waitqueue_active(wqp)) { | 2064 | if (swait_active(wqp)) { |
| 2066 | wake_up_interruptible(wqp); | 2065 | swake_up(wqp); |
| 2067 | ++vcpu->stat.halt_wakeup; | 2066 | ++vcpu->stat.halt_wakeup; |
| 2068 | } | 2067 | } |
| 2069 | 2068 | ||
| @@ -2164,7 +2163,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) | |||
| 2164 | continue; | 2163 | continue; |
| 2165 | if (vcpu == me) | 2164 | if (vcpu == me) |
| 2166 | continue; | 2165 | continue; |
| 2167 | if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) | 2166 | if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) |
| 2168 | continue; | 2167 | continue; |
| 2169 | if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) | 2168 | if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) |
| 2170 | continue; | 2169 | continue; |
