 arch/alpha/include/asm/thread_info.h        |   4
 arch/arm/kernel/topology.c                  |  26
 arch/arm64/include/asm/thread_info.h        |   2
 arch/ia64/include/asm/thread_info.h         |   3
 arch/ia64/include/asm/topology.h            |  24
 arch/metag/include/asm/thread_info.h        |   6
 arch/powerpc/kernel/smp.c                   |  31
 arch/s390/include/asm/topology.h            |  13
 arch/s390/kernel/topology.c                 |  20
 arch/tile/include/asm/thread_info.h         |   3
 arch/tile/include/asm/topology.h            |  33
 arch/x86/include/asm/thread_info.h          |   4
 arch/x86/kernel/apm_32.c                    |  11
 drivers/block/loop.c                        |   2
 drivers/block/nbd.c                         |   2
 drivers/block/pktcdvd.c                     |   2
 drivers/char/ipmi/ipmi_si_intf.c            |   2
 drivers/cpuidle/cpuidle.c                   |  55
 drivers/cpuidle/governors/menu.c            |  17
 drivers/s390/crypto/ap_bus.c                |   2
 drivers/scsi/bnx2fc/bnx2fc_fcoe.c           |   4
 drivers/scsi/bnx2i/bnx2i_hwi.c              |   2
 drivers/scsi/fcoe/fcoe.c                    |   2
 drivers/scsi/ibmvscsi/ibmvfc.c              |   2
 drivers/scsi/ibmvscsi/ibmvscsi.c            |   2
 drivers/scsi/lpfc/lpfc_hbadisc.c            |   2
 drivers/scsi/qla2xxx/qla_os.c               |   2
 drivers/staging/android/binder.c            |   4
 drivers/staging/lustre/lustre/llite/lloop.c |   2
 fs/ocfs2/cluster/heartbeat.c                |   2
 include/linux/cpuidle.h                     |   7
 include/linux/sched.h                       | 104
 include/linux/sched/prio.h                  |  16
 include/linux/thread_info.h                 |  14
 include/linux/topology.h                    | 128
 kernel/locking/locktorture.c                |   2
 kernel/power/suspend.c                      |   2
 kernel/sched/core.c                         | 324
 kernel/sched/deadline.c                     |   6
 kernel/sched/fair.c                         | 226
 kernel/sched/idle.c                         | 140
 kernel/sched/rt.c                           | 119
 kernel/sched/sched.h                        |  26
 kernel/sched/stop_task.c                    |   4
 kernel/sys.c                                |   6
 kernel/workqueue.c                          |   6
 mm/huge_memory.c                            |   2
 mm/memory.c                                 |   3
 48 files changed, 759 insertions(+), 662 deletions(-)
diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
index 3d6ce6d56fc9..48bbea6898b3 100644
--- a/arch/alpha/include/asm/thread_info.h
+++ b/arch/alpha/include/asm/thread_info.h
@@ -73,12 +73,14 @@ register struct thread_info *__current_thread_info __asm__("$8");
 #define TIF_SYSCALL_AUDIT	4	/* syscall audit active */
 #define TIF_DIE_IF_KERNEL	9	/* dik recursion lock */
 #define TIF_MEMDIE		13	/* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG	14	/* idle is polling for TIF_NEED_RESCHED */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
+#define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
 /* Work to do on interrupt/exception return. */
 #define _TIF_WORK_MASK		(_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
@@ -92,8 +94,6 @@ register struct thread_info *__current_thread_info __asm__("$8");
 #define TS_UAC_NOFIX		0x0002	/* ! flags as they match */
 #define TS_UAC_SIGBUS		0x0004	/* ! userspace part of 'osf_sysinfo' */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
-#define TS_POLLING		0x0010	/* idle task polling need_resched,
-					   skip sending interrupt */
 
 #ifndef __ASSEMBLY__
 #define HAVE_SET_RESTORE_SIGMASK	1
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 0bc94b1fd1ae..71e1fec6d31a 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -185,6 +185,15 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	return &cpu_topology[cpu].core_sibling;
 }
 
+/*
+ * The current assumption is that we can power gate each core independently.
+ * This will be superseded by DT binding once available.
+ */
+const struct cpumask *cpu_corepower_mask(int cpu)
+{
+	return &cpu_topology[cpu].thread_sibling;
+}
+
 static void update_siblings_masks(unsigned int cpuid)
 {
 	struct cputopo_arm *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
@@ -266,6 +275,20 @@ void store_cpu_topology(unsigned int cpuid)
 		cpu_topology[cpuid].socket_id, mpidr);
 }
 
+static inline const int cpu_corepower_flags(void)
+{
+	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
+}
+
+static struct sched_domain_topology_level arm_topology[] = {
+#ifdef CONFIG_SCHED_MC
+	{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
 /*
  * init_cpu_topology is called at boot when only one cpu is running
  * which prevent simultaneous write access to cpu_topology array
@@ -289,4 +312,7 @@ void __init init_cpu_topology(void)
 	smp_wmb();
 
 	parse_dt_topology();
+
+	/* Set scheduler topology descriptor */
+	set_sched_topology(arm_topology);
 }
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 720e70b66ffd..7b8e3a2a00fb 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -95,13 +95,11 @@ static inline struct thread_info *current_thread_info(void)
 * TIF_NEED_RESCHED	- rescheduling necessary
 * TIF_NOTIFY_RESUME	- callback before returning to user
 * TIF_USEDFPU		- FPU was used by this task this quantum (SMP)
- * TIF_POLLING_NRFLAG	- true if poll_idle() is polling TIF_NEED_RESCHED
 */
 #define TIF_SIGPENDING		0
 #define TIF_NEED_RESCHED	1
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_SYSCALL_TRACE	8
-#define TIF_POLLING_NRFLAG	16
 #define TIF_MEMDIE		18	/* is terminating due to OOM killer */
 #define TIF_FREEZE		19
 #define TIF_RESTORE_SIGMASK	20
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index 5957cf61f898..5b17418b4223 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -107,6 +107,7 @@ struct thread_info {
 #define TIF_MCA_INIT		18	/* this task is processing MCA or INIT */
 #define TIF_DB_DISABLED		19	/* debug trap disabled for fsyscall */
 #define TIF_RESTORE_RSE		21	/* user RBS is newer than kernel RBS */
+#define TIF_POLLING_NRFLAG	22	/* idle is polling for TIF_NEED_RESCHED */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
@@ -118,6 +119,7 @@ struct thread_info {
 #define _TIF_MCA_INIT		(1 << TIF_MCA_INIT)
 #define _TIF_DB_DISABLED	(1 << TIF_DB_DISABLED)
 #define _TIF_RESTORE_RSE	(1 << TIF_RESTORE_RSE)
+#define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 
 /* "work to do on user-return" bits */
 #define TIF_ALLWORK_MASK	(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\
@@ -125,7 +127,6 @@ struct thread_info {
 /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */
 #define TIF_WORK_MASK		(TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
 
-#define TS_POLLING		1	/* true if in idle loop and not sleeping */
 #define TS_RESTORE_SIGMASK	2	/* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 5cb55a1e606b..3202aa74e0d6 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -46,30 +46,6 @@
 
 void build_cpu_to_node_map(void);
 
-#define SD_CPU_INIT (struct sched_domain) {	\
-	.parent			= NULL,		\
-	.child			= NULL,		\
-	.groups			= NULL,		\
-	.min_interval		= 1,		\
-	.max_interval		= 4,		\
-	.busy_factor		= 64,		\
-	.imbalance_pct		= 125,		\
-	.cache_nice_tries	= 2,		\
-	.busy_idx		= 2,		\
-	.idle_idx		= 1,		\
-	.newidle_idx		= 0,		\
-	.wake_idx		= 0,		\
-	.forkexec_idx		= 0,		\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_NEWIDLE	\
-				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_FORK	\
-				| SD_WAKE_AFFINE,	\
-	.last_balance		= jiffies,	\
-	.balance_interval	= 1,		\
-	.nr_balance_failed	= 0,		\
-}
-
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h
index b19e9c588a16..47711336119e 100644
--- a/arch/metag/include/asm/thread_info.h
+++ b/arch/metag/include/asm/thread_info.h
@@ -117,10 +117,8 @@ static inline int kstack_end(void *addr)
 #define TIF_SECCOMP		5	/* secure computing */
 #define TIF_RESTORE_SIGMASK	6	/* restore signal mask in do_signal() */
 #define TIF_NOTIFY_RESUME	7	/* callback before returning to user */
-#define TIF_POLLING_NRFLAG	8	/* true if poll_idle() is polling
-					   TIF_NEED_RESCHED */
-#define TIF_MEMDIE		9	/* is terminating due to OOM killer */
-#define TIF_SYSCALL_TRACEPOINT	10	/* syscall tracepoint instrumentation */
+#define TIF_MEMDIE		8	/* is terminating due to OOM killer */
+#define TIF_SYSCALL_TRACEPOINT	9	/* syscall tracepoint instrumentation */
 
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index e2a4232c5871..10ffffef0414 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -766,6 +766,28 @@ int setup_profiling_timer(unsigned int multiplier)
 	return 0;
 }
 
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymmetric SMT dependency */
+static const int powerpc_smt_flags(void)
+{
+	int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+
+	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+		flags |= SD_ASYM_PACKING;
+	}
+	return flags;
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 	cpumask_var_t old_mask;
@@ -790,15 +812,8 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
 	dump_numa_cpu_topology();
 
-}
-
-int arch_sd_sibling_asym_packing(void)
-{
-	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-		return SD_ASYM_PACKING;
-	}
-	return 0;
+	set_sched_topology(powerpc_topology);
+
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index 05425b18c0aa..56af53093d24 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -26,21 +26,12 @@ extern struct cpu_topology_s390 cpu_topology[NR_CPUS];
 
 #define mc_capable() 1
 
-static inline const struct cpumask *cpu_coregroup_mask(int cpu)
-{
-	return &cpu_topology[cpu].core_mask;
-}
-
-static inline const struct cpumask *cpu_book_mask(int cpu)
-{
-	return &cpu_topology[cpu].book_mask;
-}
-
 int topology_cpu_init(struct cpu *);
 int topology_set_cpu_management(int fc);
 void topology_schedule_update(void);
 void store_topology(struct sysinfo_15_1_x *info);
 void topology_expect_change(void);
+const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #else /* CONFIG_SCHED_BOOK */
 
@@ -64,8 +55,6 @@ static inline void s390_init_cpu_topology(void)
 };
 #endif
 
-#define SD_BOOK_INIT	SD_CPU_INIT
-
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_S390_TOPOLOGY_H */
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index fa3b8cdaadac..355a16c55702 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -445,6 +445,23 @@ int topology_cpu_init(struct cpu *cpu)
 	return sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group);
 }
 
+const struct cpumask *cpu_coregroup_mask(int cpu)
+{
+	return &cpu_topology[cpu].core_mask;
+}
+
+static const struct cpumask *cpu_book_mask(int cpu)
+{
+	return &cpu_topology[cpu].book_mask;
+}
+
+static struct sched_domain_topology_level s390_topology[] = {
+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+	{ cpu_book_mask, SD_INIT_NAME(BOOK) },
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
 static int __init topology_init(void)
 {
 	if (!MACHINE_HAS_TOPOLOGY) {
@@ -453,6 +470,9 @@ static int __init topology_init(void)
 	}
 	set_topology_timer();
 out:
+
+	set_sched_topology(s390_topology);
+
 	return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching);
 }
 device_initcall(topology_init);
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index 729aa107f64e..d767ff9f59b9 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -129,6 +129,7 @@ extern void _cpu_idle(void);
 #define TIF_MEMDIE		7	/* OOM killer at work */
 #define TIF_NOTIFY_RESUME	8	/* callback before returning to user */
 #define TIF_SYSCALL_TRACEPOINT	9	/* syscall tracepoint instrumentation */
+#define TIF_POLLING_NRFLAG	10	/* idle is polling for TIF_NEED_RESCHED */
 
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
@@ -140,6 +141,7 @@ extern void _cpu_idle(void);
 #define _TIF_MEMDIE		(1<<TIF_MEMDIE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
 #define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
+#define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
 /* Work to do on any return to user space. */
 #define _TIF_ALLWORK_MASK \
@@ -162,7 +164,6 @@ extern void _cpu_idle(void);
 #ifdef __tilegx__
 #define TS_COMPAT		0x0001	/* 32-bit compatibility mode */
 #endif
-#define TS_POLLING		0x0004	/* in idle loop but not sleeping */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index d15c0d8d550f..938311844233 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -44,39 +44,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
 /* For now, use numa node -1 for global allocation. */
 #define pcibus_to_node(bus)	((void)(bus), -1)
 
-/*
- * TILE architecture has many cores integrated in one processor, so we need
- * setup bigger balance_interval for both CPU/NODE scheduling domains to
- * reduce process scheduling costs.
- */
-
-/* sched_domains SD_CPU_INIT for TILE architecture */
-#define SD_CPU_INIT (struct sched_domain) {	\
-	.min_interval		= 4,		\
-	.max_interval		= 128,		\
-	.busy_factor		= 64,		\
-	.imbalance_pct		= 125,		\
-	.cache_nice_tries	= 1,		\
-	.busy_idx		= 2,		\
-	.idle_idx		= 1,		\
-	.newidle_idx		= 0,		\
-	.wake_idx		= 0,		\
-	.forkexec_idx		= 0,		\
-						\
-	.flags			= 1*SD_LOAD_BALANCE		\
-				| 1*SD_BALANCE_NEWIDLE		\
-				| 1*SD_BALANCE_EXEC		\
-				| 1*SD_BALANCE_FORK		\
-				| 0*SD_BALANCE_WAKE		\
-				| 0*SD_WAKE_AFFINE		\
-				| 0*SD_SHARE_CPUPOWER		\
-				| 0*SD_SHARE_PKG_RESOURCES	\
-				| 0*SD_SERIALIZE		\
-				,		\
-	.last_balance		= jiffies,	\
-	.balance_interval	= 32,		\
-}
-
 /* By definition, we create nodes based on online memory. */
 #define node_has_online_mem(nid) 1
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 47e5de25ba79..854053889d4d 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_FORK		18	/* ret_from_fork */
 #define TIF_NOHZ		19	/* in adaptive nohz mode */
 #define TIF_MEMDIE		20	/* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG	21	/* idle is polling for TIF_NEED_RESCHED */
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
@@ -106,6 +107,7 @@ struct thread_info {
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
 #define _TIF_NOHZ		(1 << TIF_NOHZ)
+#define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
@@ -191,8 +193,6 @@ static inline struct thread_info *current_thread_info(void)
 * have to worry about atomic accesses.
 */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
-#define TS_POLLING		0x0004	/* idle task polling need_resched,
-					   skip sending interrupt */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 3ab03430211d..f3a1f04ed4cb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -844,21 +844,10 @@ static int apm_do_idle(void)
 	int polling;
 	int err = 0;
 
-	polling = !!(current_thread_info()->status & TS_POLLING);
-	if (polling) {
-		current_thread_info()->status &= ~TS_POLLING;
-		/*
-		 * TS_POLLING-cleared state must be visible before we
-		 * test NEED_RESCHED:
-		 */
-		smp_mb();
-	}
 	if (!need_resched()) {
 		idled = 1;
 		ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
 	}
-	if (polling)
-		current_thread_info()->status |= TS_POLLING;
 
 	if (!idled)
 		return 0;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f70a230a2945..6cb1beb47c25 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -548,7 +548,7 @@ static int loop_thread(void *data)
 	struct loop_device *lo = data;
 	struct bio *bio;
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 
 	while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 3a70ea2f7cd6..56a027d6115e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -533,7 +533,7 @@ static int nbd_thread(void *data)
 	struct nbd_device *nbd = data;
 	struct request *req;
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
 		/* wait for something to do */
 		wait_event_interruptible(nbd->waiting_wq,
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index a2af73db187b..ef166ad2dbad 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1463,7 +1463,7 @@ static int kcdrwd(void *foobar)
 	struct packet_data *pkt;
 	long min_sleep_time, residue;
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 	set_freezable();
 
 	for (;;) {
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 1c4bb4f6ce93..5d665680ae33 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -1007,7 +1007,7 @@ static int ipmi_thread(void *data)
 	struct timespec busy_until;
 
 	ipmi_si_set_not_busy(&busy_until);
-	set_user_nice(current, 19);
+	set_user_nice(current, MAX_NICE);
 	while (!kthread_should_stop()) {
 		int busy_wait;
 
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 8236746e46bb..cb7019977c50 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -32,6 +32,7 @@ LIST_HEAD(cpuidle_detected_devices);
 static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
+static bool use_deepest_state __read_mostly;
 
 int cpuidle_disabled(void)
 {
@@ -65,23 +66,42 @@ int cpuidle_play_dead(void)
 }
 
 /**
- * cpuidle_enabled - check if the cpuidle framework is ready
- * @dev: cpuidle device for this cpu
- * @drv: cpuidle driver for this cpu
+ * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode.
+ * @enable: Whether enable or disable the feature.
+ *
+ * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and
+ * always use the state with the greatest exit latency (out of the states that
+ * are not disabled).
 *
- * Return 0 on success, otherwise:
- * -NODEV : the cpuidle framework is not available
- * -EBUSY : the cpuidle framework is not initialized
+ * This function can only be called after cpuidle_pause() to avoid races.
 */
-int cpuidle_enabled(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+void cpuidle_use_deepest_state(bool enable)
 {
-	if (off || !initialized)
-		return -ENODEV;
+	use_deepest_state = enable;
+}
 
-	if (!drv || !dev || !dev->enabled)
-		return -EBUSY;
+/**
+ * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
+ * @drv: cpuidle driver for a given CPU.
+ * @dev: cpuidle device for a given CPU.
+ */
+static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
+				      struct cpuidle_device *dev)
+{
+	unsigned int latency_req = 0;
+	int i, ret = CPUIDLE_DRIVER_STATE_START - 1;
 
-	return 0;
+	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
+		struct cpuidle_state *s = &drv->states[i];
+		struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+		if (s->disabled || su->disable || s->exit_latency <= latency_req)
+			continue;
+
+		latency_req = s->exit_latency;
+		ret = i;
+	}
+	return ret;
 }
 
 /**
@@ -138,6 +158,15 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 */
 int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 {
+	if (off || !initialized)
+		return -ENODEV;
+
+	if (!drv || !dev || !dev->enabled)
+		return -EBUSY;
+
+	if (unlikely(use_deepest_state))
+		return cpuidle_find_deepest_state(drv, dev);
+
 	return cpuidle_curr_governor->select(drv, dev);
 }
 
@@ -169,7 +198,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 */
 void cpuidle_reflect(struct cpuidle_device *dev, int index)
 {
-	if (cpuidle_curr_governor->reflect)
+	if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state))
 		cpuidle_curr_governor->reflect(dev, index);
 }
 
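The selection rule introduced above is small enough to model outside the kernel: among the states that are neither disabled by the driver nor through sysfs, pick the one with the greatest exit_latency, falling back to the index just below CPUIDLE_DRIVER_STATE_START when nothing qualifies. The stand-alone C sketch below is not part of the patch; it uses a toy state table instead of the real struct cpuidle_state/cpuidle_state_usage pair, but mirrors the same loop:

#include <stdio.h>

/* Toy stand-in for the driver/device state tables (not the kernel structs). */
struct toy_state { unsigned int exit_latency; int disabled; };

/* Mirrors the cpuidle_find_deepest_state() loop over the toy table. */
static int find_deepest(const struct toy_state *s, int count, int start)
{
	unsigned int latency_req = 0;
	int i, ret = start - 1;		/* "no usable state" fallback */

	for (i = start; i < count; i++) {
		if (s[i].disabled || s[i].exit_latency <= latency_req)
			continue;
		latency_req = s[i].exit_latency;
		ret = i;
	}
	return ret;
}

int main(void)
{
	struct toy_state states[] = {
		{ 1, 0 }, { 10, 0 }, { 150, 1 /* disabled */ }, { 80, 0 },
	};

	/* Prints 3: state 2 has the largest latency but is disabled. */
	printf("%d\n", find_deepest(states, 4, 1));
	return 0;
}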
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 71b523293354..c4f80c15a48d 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -296,7 +296,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		data->needs_update = 0;
 	}
 
-	data->last_state_idx = 0;
+	data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
 
 	/* Special case when user has set very strict latency requirement */
 	if (unlikely(latency_req == 0))
@@ -311,13 +311,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	data->bucket = which_bucket(data->next_timer_us);
 
 	/*
-	 * if the correction factor is 0 (eg first time init or cpu hotplug
-	 * etc), we actually want to start out with a unity factor.
-	 */
-	if (data->correction_factor[data->bucket] == 0)
-		data->correction_factor[data->bucket] = RESOLUTION * DECAY;
-
-	/*
 	 * Force the result of multiplication to be 64 bits even if both
 	 * operands are 32 bits.
 	 * Make sure to round up for half microseconds.
@@ -466,9 +459,17 @@ static int menu_enable_device(struct cpuidle_driver *drv,
 				struct cpuidle_device *dev)
 {
 	struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
+	int i;
 
 	memset(data, 0, sizeof(struct menu_device));
 
+	/*
+	 * if the correction factor is 0 (eg first time init or cpu hotplug
+	 * etc), we actually want to start out with a unity factor.
+	 */
+	for(i = 0; i < BUCKETS; i++)
+		data->correction_factor[i] = RESOLUTION * DECAY;
+
 	return 0;
 }
 
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index ab3baa7f9508..8eec1653c9cc 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -1803,7 +1803,7 @@ static int ap_poll_thread(void *data)
 	int requests;
 	struct ap_device *ap_dev;
 
-	set_user_nice(current, 19);
+	set_user_nice(current, MAX_NICE);
 	while (1) {
 		if (ap_suspend_flag)
 			return 0;
diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
index 1d41f4b9114f..f54843023466 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
@@ -464,7 +464,7 @@ static int bnx2fc_l2_rcv_thread(void *arg)
 	struct fcoe_percpu_s *bg = arg;
 	struct sk_buff *skb;
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 	set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
 		schedule();
@@ -602,7 +602,7 @@ int bnx2fc_percpu_io_thread(void *arg)
 	struct bnx2fc_work *work, *tmp;
 	LIST_HEAD(work_list);
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 	set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
 		schedule();
diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c
index b5ffd280a1ae..d6d491c2f004 100644
--- a/drivers/scsi/bnx2i/bnx2i_hwi.c
+++ b/drivers/scsi/bnx2i/bnx2i_hwi.c
@@ -1870,7 +1870,7 @@ int bnx2i_percpu_io_thread(void *arg)
 	struct bnx2i_work *work, *tmp;
 	LIST_HEAD(work_list);
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 
 	while (!kthread_should_stop()) {
 		spin_lock_bh(&p->p_work_lock);
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index d5e105b173f0..00ee0ed642aa 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -1872,7 +1872,7 @@ static int fcoe_percpu_receive_thread(void *arg)
 
 	skb_queue_head_init(&tmp);
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 
 retry:
 	while (!kthread_should_stop()) {
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 23f5ba5e6472..8dd47689d584 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -4515,7 +4515,7 @@ static int ibmvfc_work(void *data)
 	struct ibmvfc_host *vhost = data;
 	int rc;
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 
 	while (1) {
 		rc = wait_event_interruptible(vhost->work_wait_q,
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index fa764406df68..2ebfb2bb0f42 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -2213,7 +2213,7 @@ static int ibmvscsi_work(void *data)
 	struct ibmvscsi_host_data *hostdata = data;
 	int rc;
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 
 	while (1) {
 		rc = wait_event_interruptible(hostdata->work_wait_q,
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index 59b51c529ba0..294c072e9083 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -731,7 +731,7 @@ lpfc_do_work(void *p)
 	struct lpfc_hba *phba = p;
 	int rc;
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 	current->flags |= PF_NOFREEZE;
 	phba->data_flags = 0;
 
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 19e99cc33724..afc84814e9bb 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -4828,7 +4828,7 @@ qla2x00_do_dpc(void *data)
 	ha = (struct qla_hw_data *)data;
 	base_vha = pci_get_drvdata(ha->pdev);
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 
 	set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c
index 989f809f323f..a741da77828a 100644
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@@ -439,12 +439,12 @@ static void binder_set_nice(long nice)
 		set_user_nice(current, nice);
 		return;
 	}
-	min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur;
+	min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur);
 	binder_debug(BINDER_DEBUG_PRIORITY_CAP,
 		     "%d: nice value %ld not allowed use %ld instead\n",
 		      current->pid, nice, min_nice);
 	set_user_nice(current, min_nice);
-	if (min_nice < 20)
+	if (min_nice <= MAX_NICE)
 		return;
 	binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
 }
diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c
index b9694b8cb5dd..0ff8c3362a8d 100644
--- a/drivers/staging/lustre/lustre/llite/lloop.c
+++ b/drivers/staging/lustre/lustre/llite/lloop.c
@@ -404,7 +404,7 @@ static int loop_thread(void *data)
 	int refcheck;
 	int ret = 0;
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 
 	lo->lo_state = LLOOP_BOUND;
 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index bf482dfed14f..73039295d0d1 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1107,7 +1107,7 @@ static int o2hb_thread(void *data)
 
 	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 
 	/* Pin node */
 	o2nm_depend_this_node();
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index b0238cba440b..c51a436135c4 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -120,8 +120,6 @@ struct cpuidle_driver {
 #ifdef CONFIG_CPU_IDLE
 extern void disable_cpuidle(void);
 
-extern int cpuidle_enabled(struct cpuidle_driver *drv,
-			   struct cpuidle_device *dev);
 extern int cpuidle_select(struct cpuidle_driver *drv,
 			  struct cpuidle_device *dev);
 extern int cpuidle_enter(struct cpuidle_driver *drv,
@@ -145,13 +143,11 @@ extern void cpuidle_resume(void);
 extern int cpuidle_enable_device(struct cpuidle_device *dev);
 extern void cpuidle_disable_device(struct cpuidle_device *dev);
 extern int cpuidle_play_dead(void);
+extern void cpuidle_use_deepest_state(bool enable);
 
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
 #else
 static inline void disable_cpuidle(void) { }
-static inline int cpuidle_enabled(struct cpuidle_driver *drv,
-				  struct cpuidle_device *dev)
-{return -ENODEV; }
 static inline int cpuidle_select(struct cpuidle_driver *drv,
 				 struct cpuidle_device *dev)
 {return -ENODEV; }
@@ -180,6 +176,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev)
 {return -ENODEV; }
 static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
 static inline int cpuidle_play_dead(void) {return -ENODEV; }
+static inline void cpuidle_use_deepest_state(bool enable) {}
 static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
 				struct cpuidle_device *dev) {return NULL; }
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4dce5d844b74..70f67e4e6156 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -870,6 +870,7 @@ enum cpu_idle_type {
 #define SD_BALANCE_WAKE		0x0010	/* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
 #define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
+#define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
 #define SD_ASYM_PACKING		0x0800	/* Place busy groups earlier in the domain */
@@ -877,7 +878,26 @@ enum cpu_idle_type {
 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 #define SD_NUMA			0x4000	/* cross-node balancing */
 
-extern int __weak arch_sd_sibiling_asym_packing(void);
+#ifdef CONFIG_SCHED_SMT
+static inline const int cpu_smt_flags(void)
+{
+	return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static inline const int cpu_core_flags(void)
+{
+	return SD_SHARE_PKG_RESOURCES;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+static inline const int cpu_numa_flags(void)
+{
+	return SD_NUMA;
+}
+#endif
 
 struct sched_domain_attr {
 	int relax_domain_level;
@@ -985,6 +1005,38 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
 
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+typedef const int (*sched_domain_flags_f)(void);
+
+#define SDTL_OVERLAP		0x01
+
+struct sd_data {
+	struct sched_domain **__percpu sd;
+	struct sched_group **__percpu sg;
+	struct sched_group_power **__percpu sgp;
+};
+
+struct sched_domain_topology_level {
+	sched_domain_mask_f mask;
+	sched_domain_flags_f sd_flags;
+	int		    flags;
+	int		    numa_level;
+	struct sd_data      data;
+#ifdef CONFIG_SCHED_DEBUG
+	char                *name;
+#endif
+};
+
+extern struct sched_domain_topology_level *sched_domain_topology;
+
+extern void set_sched_topology(struct sched_domain_topology_level *tl);
+
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(type)		.name = #type
+#else
+# define SD_INIT_NAME(type)
+#endif
+
 #else /* CONFIG_SMP */
 
 struct sched_domain_attr;
@@ -1123,8 +1175,8 @@ struct sched_dl_entity {
 
 	/*
	 * Original scheduling parameters. Copied here from sched_attr
-	 * during sched_setscheduler2(), they will remain the same until
-	 * the next sched_setscheduler2().
+	 * during sched_setattr(), they will remain the same until
+	 * the next sched_setattr().
	 */
 	u64 dl_runtime;		/* maximum runtime for each instance */
 	u64 dl_deadline;	/* relative deadline of each instance */
@@ -2723,51 +2775,9 @@ static inline int spin_needbreak(spinlock_t *lock)
 
 /*
 * Idle thread specific functions to determine the need_resched
- * polling state. We have two versions, one based on TS_POLLING in
- * thread_info.status and one based on TIF_POLLING_NRFLAG in
- * thread_info.flags
+ * polling state.
 */
-#ifdef TS_POLLING
-static inline int tsk_is_polling(struct task_struct *p)
-{
-	return task_thread_info(p)->status & TS_POLLING;
-}
-static inline void __current_set_polling(void)
-{
-	current_thread_info()->status |= TS_POLLING;
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	__current_set_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
-	smp_mb();
-
-	return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
-	current_thread_info()->status &= ~TS_POLLING;
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-	__current_clr_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
-	smp_mb();
-
-	return unlikely(tif_need_resched());
-}
-#elif defined(TIF_POLLING_NRFLAG)
+#ifdef TIF_POLLING_NRFLAG
 static inline int tsk_is_polling(struct task_struct *p)
 {
 	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
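The declarations above are the whole per-architecture interface: a NULL-terminated array of sched_domain_topology_level entries, each giving a cpumask helper, an optional flags helper and (under CONFIG_SCHED_DEBUG) a name, handed to set_sched_topology() once at boot. A hypothetical architecture would use it roughly as in the sketch below, modeled on the arm, powerpc and s390 tables earlier in this diff; the my_arch_* names are placeholders and the fragment assumes the usual in-kernel headers rather than being a stand-alone program:

/*
 * Sketch only: replace the default topology table with an arch-specific one.
 * cpu_smt_mask()/cpu_coregroup_mask()/cpu_cpu_mask() and the *_flags helpers
 * are the ones declared in this patch; my_arch_* is made up for illustration.
 */
static struct sched_domain_topology_level my_arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },	/* table is terminated on a NULL mask helper */
};

static void __init my_arch_init_sched_topology(void)
{
	/* Called once during early boot, before secondary CPUs come up. */
	set_sched_topology(my_arch_topology);
}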
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index ac322583c820..d9cf5a5762d9 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -41,4 +41,20 @@
 #define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 
+/*
+ * Convert nice value [19,-20] to rlimit style value [1,40].
+ */
+static inline long nice_to_rlimit(long nice)
+{
+	return (MAX_NICE - nice + 1);
+}
+
+/*
+ * Convert rlimit style value [1,40] to nice value [-20, 19].
+ */
+static inline long rlimit_to_nice(long prio)
+{
+	return (MAX_NICE - prio + 1);
+}
+
 #endif /* _SCHED_PRIO_H */
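With MAX_NICE = 19 and MIN_NICE = -20, the two helpers simply mirror the nice range [-20, 19] onto the RLIMIT_NICE range [1, 40] and back: nice_to_rlimit(19) = 1, nice_to_rlimit(-20) = 40, and rlimit_to_nice() inverts the mapping. A small stand-alone C check (not part of the patch; the constant is copied locally):

#include <stdio.h>

/* Local copies of the helpers, with MAX_NICE = 19 as in <linux/sched/prio.h>. */
#define MAX_NICE 19

static long nice_to_rlimit(long nice) { return MAX_NICE - nice + 1; }
static long rlimit_to_nice(long prio) { return MAX_NICE - prio + 1; }

int main(void)
{
	/* Highest-priority nice (-20) maps to the largest rlimit value (40). */
	printf("%ld %ld\n", nice_to_rlimit(-20), nice_to_rlimit(19));	/* 40 1 */

	/* The two helpers are inverses of each other. */
	printf("%ld\n", rlimit_to_nice(nice_to_rlimit(-5)));		/* -5 */
	return 0;
}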
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index fddbe2023a5d..cb0cec94fda3 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -104,20 +104,6 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 #define test_thread_flag(flag) \
 	test_ti_thread_flag(current_thread_info(), flag)
 
-static inline __deprecated void set_need_resched(void)
-{
-	/*
-	 * Use of this function in deprecated.
-	 *
-	 * As of this writing there are only a few users in the DRM tree left
-	 * all of which are wrong and can be removed without causing too much
-	 * grief.
-	 *
-	 * The DRM people are aware and are working on removing the last few
-	 * instances.
-	 */
-}
-
 #define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
 
 #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 7062330a1329..973671ff9e7d 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -66,121 +66,6 @@ int arch_update_cpu_topology(void);
 #define PENALTY_FOR_NODE_WITH_CPUS	(1)
 #endif
 
-/*
- * Below are the 3 major initializers used in building sched_domains:
- * SD_SIBLING_INIT, for SMT domains
- * SD_CPU_INIT, for SMP domains
- *
- * Any architecture that cares to do any tuning to these values should do so
- * by defining their own arch-specific initializer in include/asm/topology.h.
- * A definition there will automagically override these default initializers
- * and allow arch-specific performance tuning of sched_domains.
- * (Only non-zero and non-null fields need be specified.)
- */
-
-#ifdef CONFIG_SCHED_SMT
-/* MCD - Do we really need this?  It is always on if CONFIG_SCHED_SMT is,
- * so can't we drop this in favor of CONFIG_SCHED_SMT?
- */
-#define ARCH_HAS_SCHED_WAKE_IDLE
-/* Common values for SMT siblings */
-#ifndef SD_SIBLING_INIT
-#define SD_SIBLING_INIT (struct sched_domain) {	\
-	.min_interval		= 1,		\
-	.max_interval		= 2,		\
-	.busy_factor		= 64,		\
-	.imbalance_pct		= 110,		\
-						\
-	.flags			= 1*SD_LOAD_BALANCE		\
-				| 1*SD_BALANCE_NEWIDLE		\
-				| 1*SD_BALANCE_EXEC		\
-				| 1*SD_BALANCE_FORK		\
-				| 0*SD_BALANCE_WAKE		\
-				| 1*SD_WAKE_AFFINE		\
-				| 1*SD_SHARE_CPUPOWER		\
-				| 1*SD_SHARE_PKG_RESOURCES	\
-				| 0*SD_SERIALIZE		\
-				| 0*SD_PREFER_SIBLING		\
-				| arch_sd_sibling_asym_packing() \
-				,		\
-	.last_balance		= jiffies,	\
-	.balance_interval	= 1,		\
-	.smt_gain		= 1178,	/* 15% */ \
-	.max_newidle_lb_cost	= 0,		\
-	.next_decay_max_lb_cost	= jiffies,	\
-}
-#endif
-#endif /* CONFIG_SCHED_SMT */
-
-#ifdef CONFIG_SCHED_MC
-/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
-#ifndef SD_MC_INIT
-#define SD_MC_INIT (struct sched_domain) {	\
-	.min_interval		= 1,		\
-	.max_interval		= 4,		\
-	.busy_factor		= 64,		\
-	.imbalance_pct		= 125,		\
-	.cache_nice_tries	= 1,		\
-	.busy_idx		= 2,		\
-	.wake_idx		= 0,		\
-	.forkexec_idx		= 0,		\
-						\
-	.flags			= 1*SD_LOAD_BALANCE		\
-				| 1*SD_BALANCE_NEWIDLE		\
-				| 1*SD_BALANCE_EXEC		\
-				| 1*SD_BALANCE_FORK		\
-				| 0*SD_BALANCE_WAKE		\
-				| 1*SD_WAKE_AFFINE		\
-				| 0*SD_SHARE_CPUPOWER		\
-				| 1*SD_SHARE_PKG_RESOURCES	\
-				| 0*SD_SERIALIZE		\
-				,		\
-	.last_balance		= jiffies,	\
-	.balance_interval	= 1,		\
-	.max_newidle_lb_cost	= 0,		\
-	.next_decay_max_lb_cost	= jiffies,	\
-}
-#endif
-#endif /* CONFIG_SCHED_MC */
-
-/* Common values for CPUs */
-#ifndef SD_CPU_INIT
-#define SD_CPU_INIT (struct sched_domain) {	\
-	.min_interval		= 1,		\
-	.max_interval		= 4,		\
-	.busy_factor		= 64,		\
-	.imbalance_pct		= 125,		\
-	.cache_nice_tries	= 1,		\
-	.busy_idx		= 2,		\
-	.idle_idx		= 1,		\
-	.newidle_idx		= 0,		\
-	.wake_idx		= 0,		\
-	.forkexec_idx		= 0,		\
-						\
-	.flags			= 1*SD_LOAD_BALANCE		\
-				| 1*SD_BALANCE_NEWIDLE		\
-				| 1*SD_BALANCE_EXEC		\
-				| 1*SD_BALANCE_FORK		\
-				| 0*SD_BALANCE_WAKE		\
-				| 1*SD_WAKE_AFFINE		\
-				| 0*SD_SHARE_CPUPOWER		\
-				| 0*SD_SHARE_PKG_RESOURCES	\
-				| 0*SD_SERIALIZE		\
-				| 1*SD_PREFER_SIBLING		\
-				,		\
-	.last_balance		= jiffies,	\
-	.balance_interval	= 1,		\
-	.max_newidle_lb_cost	= 0,		\
-	.next_decay_max_lb_cost	= jiffies,	\
-}
-#endif
-
-#ifdef CONFIG_SCHED_BOOK
-#ifndef SD_BOOK_INIT
-#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
-#endif
-#endif /* CONFIG_SCHED_BOOK */
-
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DECLARE_PER_CPU(int, numa_node);
 
@@ -295,4 +180,17 @@ static inline int cpu_to_mem(int cpu)
 #define topology_core_cpumask(cpu)	cpumask_of(cpu)
 #endif
 
+#ifdef CONFIG_SCHED_SMT
+static inline const struct cpumask *cpu_smt_mask(int cpu)
+{
+	return topology_thread_cpumask(cpu);
+}
+#endif
+
+static inline const struct cpumask *cpu_cpu_mask(int cpu)
+{
+	return cpumask_of_node(cpu_to_node(cpu));
+}
+
+
 #endif /* _LINUX_TOPOLOGY_H */
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index dbafeac18e4d..0955b885d0dc 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg)
 	static DEFINE_TORTURE_RANDOM(rand);
 
 	VERBOSE_TOROUT_STRING("lock_torture_writer task started");
-	set_user_nice(current, 19);
+	set_user_nice(current, MAX_NICE);
 
 	do {
 		if ((torture_random(&rand) & 0xfffff) == 0)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8233cd4047d7..155721f7f909 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -54,9 +54,11 @@ static void freeze_begin(void)
 
 static void freeze_enter(void)
 {
+	cpuidle_use_deepest_state(true);
 	cpuidle_resume();
 	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
 	cpuidle_pause();
+	cpuidle_use_deepest_state(false);
 }
 
 void freeze_wake(void)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a62a7dec3986..913c6d6cc2c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -522,6 +522,39 @@ static inline void init_hrtick(void)
 #endif	/* CONFIG_SCHED_HRTICK */
 
 /*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val)					\
+({	typeof(*(ptr)) __old, __val = *(ptr);			\
+	for (;;) {						\
+		__old = cmpxchg((ptr), __val, __val | (val));	\
+		if (__old == __val)				\
+			break;					\
+		__val = __old;					\
+	}							\
+	__old;							\
+})
+
+#ifdef TIF_POLLING_NRFLAG
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	struct thread_info *ti = task_thread_info(p);
+	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	set_tsk_need_resched(p);
+	return true;
+}
+#endif
+
+/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
@@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)
 	if (test_tsk_need_resched(p))
 		return;
 
-	set_tsk_need_resched(p);
-
 	cpu = task_cpu(p);
+
 	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(p);
 		set_preempt_need_resched();
 		return;
 	}
 
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(p))
+	if (set_nr_and_not_polling(p))
 		smp_send_reschedule(cpu);
 }
 
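The fetch_or()/set_nr_and_not_polling() pair added above is what lets resched_task() skip the IPI: TIF_NEED_RESCHED is set with a single atomic read-modify-write, and the value that same operation returns says whether the remote CPU had TIF_POLLING_NRFLAG set (it is polling and will notice the flag by itself) or has to be kicked with smp_send_reschedule(). A stand-alone user-space model of that logic, with a GCC/Clang __sync builtin standing in for the kernel's cmpxchg() and made-up flag bits:

#include <stdio.h>

#define MY_NEED_RESCHED	(1u << 1)
#define MY_POLLING	(1u << 16)

/* User-space model of fetch_or(): OR in 'val' atomically and return the
 * value that was in *ptr before the OR. */
static unsigned int fetch_or_flags(unsigned int *ptr, unsigned int val)
{
	unsigned int old = *ptr, prev;

	for (;;) {
		prev = __sync_val_compare_and_swap(ptr, old, old | val);
		if (prev == old)
			return prev;
		old = prev;
	}
}

int main(void)
{
	unsigned int flags = MY_POLLING;	/* idle CPU advertising polling */

	/* Mirrors set_nr_and_not_polling(): set NEED_RESCHED and report
	 * whether an IPI is required, i.e. the target was NOT polling. */
	int need_ipi = !(fetch_or_flags(&flags, MY_NEED_RESCHED) & MY_POLLING);

	printf("need_ipi=%d flags=%#x\n", need_ipi, flags);	/* need_ipi=0 */
	return 0;
}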
@@ -3018,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
3018int can_nice(const struct task_struct *p, const int nice) 3049int can_nice(const struct task_struct *p, const int nice)
3019{ 3050{
3020 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3051 /* convert nice value [19,-20] to rlimit style value [1,40] */
3021 int nice_rlim = 20 - nice; 3052 int nice_rlim = nice_to_rlimit(nice);
3022 3053
3023 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3054 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3024 capable(CAP_SYS_NICE)); 3055 capable(CAP_SYS_NICE));
@@ -3042,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
3042 * We don't have to worry. Conceptually one call occurs first 3073 * We don't have to worry. Conceptually one call occurs first
3043 * and we have a single winner. 3074 * and we have a single winner.
3044 */ 3075 */
3045 if (increment < -40) 3076 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3046 increment = -40;
3047 if (increment > 40)
3048 increment = 40;
3049
3050 nice = task_nice(current) + increment; 3077 nice = task_nice(current) + increment;
3051 if (nice < MIN_NICE)
3052 nice = MIN_NICE;
3053 if (nice > MAX_NICE)
3054 nice = MAX_NICE;
3055 3078
3079 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3056 if (increment < 0 && !can_nice(current, nice)) 3080 if (increment < 0 && !can_nice(current, nice))
3057 return -EPERM; 3081 return -EPERM;
3058 3082
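The clamp()-based rewrite above leans on the nice-to-rlimit conversion described by the comment in can_nice(): nice 19..-20 maps onto the RLIMIT_NICE scale 1..40. Here is a standalone sketch of that mapping, assuming nice_to_rlimit() and its inverse behave as the comment implies; the _sketch helpers are illustrative stand-ins, not the kernel's definitions.

#include <assert.h>

#define MIN_NICE    -20
#define MAX_NICE    19

/* assumed behaviour: nice 19..-20 -> rlimit 1..40 */
static long nice_to_rlimit_sketch(long nice)
{
    return MAX_NICE - nice + 1;
}

/* assumed inverse: rlimit 1..40 -> nice 19..-20 */
static long rlimit_to_nice_sketch(long prio)
{
    return MAX_NICE - prio + 1;
}

int main(void)
{
    assert(nice_to_rlimit_sketch(MAX_NICE) == 1);
    assert(nice_to_rlimit_sketch(0) == 20);        /* the old "20 - nice" */
    assert(nice_to_rlimit_sketch(MIN_NICE) == 40);
    assert(rlimit_to_nice_sketch(nice_to_rlimit_sketch(5)) == 5);
    return 0;
}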
@@ -3642,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3642 */ 3666 */
3643 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3667 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3644 3668
3645out: 3669 return 0;
3646 return ret;
3647 3670
3648err_size: 3671err_size:
3649 put_user(sizeof(*attr), &uattr->size); 3672 put_user(sizeof(*attr), &uattr->size);
3650 ret = -E2BIG; 3673 return -E2BIG;
3651 goto out;
3652} 3674}
3653 3675
3654/** 3676/**
@@ -3808,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3808 3830
3809 for (; addr < end; addr++) { 3831 for (; addr < end; addr++) {
3810 if (*addr) 3832 if (*addr)
3811 goto err_size; 3833 return -EFBIG;
3812 } 3834 }
3813 3835
3814 attr->size = usize; 3836 attr->size = usize;
@@ -3818,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3818 if (ret) 3840 if (ret)
3819 return -EFAULT; 3841 return -EFAULT;
3820 3842
3821out: 3843 return 0;
3822 return ret;
3823
3824err_size:
3825 ret = -E2BIG;
3826 goto out;
3827} 3844}
3828 3845
3829/** 3846/**
@@ -5093,10 +5110,20 @@ static struct notifier_block migration_notifier = {
5093 .priority = CPU_PRI_MIGRATION, 5110 .priority = CPU_PRI_MIGRATION,
5094}; 5111};
5095 5112
5113static void __cpuinit set_cpu_rq_start_time(void)
5114{
5115 int cpu = smp_processor_id();
5116 struct rq *rq = cpu_rq(cpu);
5117 rq->age_stamp = sched_clock_cpu(cpu);
5118}
5119
5096static int sched_cpu_active(struct notifier_block *nfb, 5120static int sched_cpu_active(struct notifier_block *nfb,
5097 unsigned long action, void *hcpu) 5121 unsigned long action, void *hcpu)
5098{ 5122{
5099 switch (action & ~CPU_TASKS_FROZEN) { 5123 switch (action & ~CPU_TASKS_FROZEN) {
5124 case CPU_STARTING:
5125 set_cpu_rq_start_time();
5126 return NOTIFY_OK;
5100 case CPU_DOWN_FAILED: 5127 case CPU_DOWN_FAILED:
5101 set_cpu_active((long)hcpu, true); 5128 set_cpu_active((long)hcpu, true);
5102 return NOTIFY_OK; 5129 return NOTIFY_OK;
@@ -5305,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
5305 SD_BALANCE_FORK | 5332 SD_BALANCE_FORK |
5306 SD_BALANCE_EXEC | 5333 SD_BALANCE_EXEC |
5307 SD_SHARE_CPUPOWER | 5334 SD_SHARE_CPUPOWER |
5308 SD_SHARE_PKG_RESOURCES)) { 5335 SD_SHARE_PKG_RESOURCES |
5336 SD_SHARE_POWERDOMAIN)) {
5309 if (sd->groups != sd->groups->next) 5337 if (sd->groups != sd->groups->next)
5310 return 0; 5338 return 0;
5311 } 5339 }
@@ -5336,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5336 SD_BALANCE_EXEC | 5364 SD_BALANCE_EXEC |
5337 SD_SHARE_CPUPOWER | 5365 SD_SHARE_CPUPOWER |
5338 SD_SHARE_PKG_RESOURCES | 5366 SD_SHARE_PKG_RESOURCES |
5339 SD_PREFER_SIBLING); 5367 SD_PREFER_SIBLING |
5368 SD_SHARE_POWERDOMAIN);
5340 if (nr_node_ids == 1) 5369 if (nr_node_ids == 1)
5341 pflags &= ~SD_SERIALIZE; 5370 pflags &= ~SD_SERIALIZE;
5342 } 5371 }
@@ -5610,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
5610 5639
5611__setup("isolcpus=", isolated_cpu_setup); 5640__setup("isolcpus=", isolated_cpu_setup);
5612 5641
5613static const struct cpumask *cpu_cpu_mask(int cpu)
5614{
5615 return cpumask_of_node(cpu_to_node(cpu));
5616}
5617
5618struct sd_data {
5619 struct sched_domain **__percpu sd;
5620 struct sched_group **__percpu sg;
5621 struct sched_group_power **__percpu sgp;
5622};
5623
5624struct s_data { 5642struct s_data {
5625 struct sched_domain ** __percpu sd; 5643 struct sched_domain ** __percpu sd;
5626 struct root_domain *rd; 5644 struct root_domain *rd;
@@ -5633,21 +5651,6 @@ enum s_alloc {
5633 sa_none, 5651 sa_none,
5634}; 5652};
5635 5653
5636struct sched_domain_topology_level;
5637
5638typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5639typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5640
5641#define SDTL_OVERLAP 0x01
5642
5643struct sched_domain_topology_level {
5644 sched_domain_init_f init;
5645 sched_domain_mask_f mask;
5646 int flags;
5647 int numa_level;
5648 struct sd_data data;
5649};
5650
5651/* 5654/*
5652 * Build an iteration mask that can exclude certain CPUs from the upwards 5655 * Build an iteration mask that can exclude certain CPUs from the upwards
5653 * domain traversal. 5656 * domain traversal.
@@ -5815,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5815 continue; 5818 continue;
5816 5819
5817 group = get_group(i, sdd, &sg); 5820 group = get_group(i, sdd, &sg);
5818 cpumask_clear(sched_group_cpus(sg));
5819 sg->sgp->power = 0;
5820 cpumask_setall(sched_group_mask(sg)); 5821 cpumask_setall(sched_group_mask(sg));
5821 5822
5822 for_each_cpu(j, span) { 5823 for_each_cpu(j, span) {
@@ -5866,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5866 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5867 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5867} 5868}
5868 5869
5869int __weak arch_sd_sibling_asym_packing(void)
5870{
5871 return 0*SD_ASYM_PACKING;
5872}
5873
5874/* 5870/*
5875 * Initializers for schedule domains 5871 * Initializers for schedule domains
5876 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5872 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5877 */ 5873 */
5878 5874
5879#ifdef CONFIG_SCHED_DEBUG
5880# define SD_INIT_NAME(sd, type) sd->name = #type
5881#else
5882# define SD_INIT_NAME(sd, type) do { } while (0)
5883#endif
5884
5885#define SD_INIT_FUNC(type) \
5886static noinline struct sched_domain * \
5887sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5888{ \
5889 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5890 *sd = SD_##type##_INIT; \
5891 SD_INIT_NAME(sd, type); \
5892 sd->private = &tl->data; \
5893 return sd; \
5894}
5895
5896SD_INIT_FUNC(CPU)
5897#ifdef CONFIG_SCHED_SMT
5898 SD_INIT_FUNC(SIBLING)
5899#endif
5900#ifdef CONFIG_SCHED_MC
5901 SD_INIT_FUNC(MC)
5902#endif
5903#ifdef CONFIG_SCHED_BOOK
5904 SD_INIT_FUNC(BOOK)
5905#endif
5906
5907static int default_relax_domain_level = -1; 5875static int default_relax_domain_level = -1;
5908int sched_domain_level_max; 5876int sched_domain_level_max;
5909 5877
@@ -5991,99 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
5991 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5959 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5992} 5960}
5993 5961
5994#ifdef CONFIG_SCHED_SMT
5995static const struct cpumask *cpu_smt_mask(int cpu)
5996{
5997 return topology_thread_cpumask(cpu);
5998}
5999#endif
6000
6001/*
6002 * Topology list, bottom-up.
6003 */
6004static struct sched_domain_topology_level default_topology[] = {
6005#ifdef CONFIG_SCHED_SMT
6006 { sd_init_SIBLING, cpu_smt_mask, },
6007#endif
6008#ifdef CONFIG_SCHED_MC
6009 { sd_init_MC, cpu_coregroup_mask, },
6010#endif
6011#ifdef CONFIG_SCHED_BOOK
6012 { sd_init_BOOK, cpu_book_mask, },
6013#endif
6014 { sd_init_CPU, cpu_cpu_mask, },
6015 { NULL, },
6016};
6017
6018static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6019
6020#define for_each_sd_topology(tl) \
6021 for (tl = sched_domain_topology; tl->init; tl++)
6022
6023#ifdef CONFIG_NUMA 5962#ifdef CONFIG_NUMA
6024
6025static int sched_domains_numa_levels; 5963static int sched_domains_numa_levels;
6026static int *sched_domains_numa_distance; 5964static int *sched_domains_numa_distance;
6027static struct cpumask ***sched_domains_numa_masks; 5965static struct cpumask ***sched_domains_numa_masks;
6028static int sched_domains_curr_level; 5966static int sched_domains_curr_level;
5967#endif
6029 5968
6030static inline int sd_local_flags(int level) 5969/*
6031{ 5970 * SD_flags allowed in topology descriptions.
6032 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 5971 *
6033 return 0; 5972 * SD_SHARE_CPUPOWER - describes SMT topologies
6034 5973 * SD_SHARE_PKG_RESOURCES - describes shared caches
6035 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 5974 * SD_NUMA - describes NUMA topologies
6036} 5975 * SD_SHARE_POWERDOMAIN - describes shared power domain
5976 *
5977 * Odd one out:
5978 * SD_ASYM_PACKING - describes SMT quirks
5979 */
5980#define TOPOLOGY_SD_FLAGS \
5981 (SD_SHARE_CPUPOWER | \
5982 SD_SHARE_PKG_RESOURCES | \
5983 SD_NUMA | \
5984 SD_ASYM_PACKING | \
5985 SD_SHARE_POWERDOMAIN)
6037 5986
6038static struct sched_domain * 5987static struct sched_domain *
6039sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 5988sd_init(struct sched_domain_topology_level *tl, int cpu)
6040{ 5989{
6041 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 5990 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6042 int level = tl->numa_level; 5991 int sd_weight, sd_flags = 0;
6043 int sd_weight = cpumask_weight( 5992
6044 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 5993#ifdef CONFIG_NUMA
5994 /*
5995 * Ugly hack to pass state to sd_numa_mask()...
5996 */
5997 sched_domains_curr_level = tl->numa_level;
5998#endif
5999
6000 sd_weight = cpumask_weight(tl->mask(cpu));
6001
6002 if (tl->sd_flags)
6003 sd_flags = (*tl->sd_flags)();
6004 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6005 "wrong sd_flags in topology description\n"))
6006 sd_flags &= ~TOPOLOGY_SD_FLAGS;
6045 6007
6046 *sd = (struct sched_domain){ 6008 *sd = (struct sched_domain){
6047 .min_interval = sd_weight, 6009 .min_interval = sd_weight,
6048 .max_interval = 2*sd_weight, 6010 .max_interval = 2*sd_weight,
6049 .busy_factor = 32, 6011 .busy_factor = 32,
6050 .imbalance_pct = 125, 6012 .imbalance_pct = 125,
6051 .cache_nice_tries = 2, 6013
6052 .busy_idx = 3, 6014 .cache_nice_tries = 0,
6053 .idle_idx = 2, 6015 .busy_idx = 0,
6016 .idle_idx = 0,
6054 .newidle_idx = 0, 6017 .newidle_idx = 0,
6055 .wake_idx = 0, 6018 .wake_idx = 0,
6056 .forkexec_idx = 0, 6019 .forkexec_idx = 0,
6057 6020
6058 .flags = 1*SD_LOAD_BALANCE 6021 .flags = 1*SD_LOAD_BALANCE
6059 | 1*SD_BALANCE_NEWIDLE 6022 | 1*SD_BALANCE_NEWIDLE
6060 | 0*SD_BALANCE_EXEC 6023 | 1*SD_BALANCE_EXEC
6061 | 0*SD_BALANCE_FORK 6024 | 1*SD_BALANCE_FORK
6062 | 0*SD_BALANCE_WAKE 6025 | 0*SD_BALANCE_WAKE
6063 | 0*SD_WAKE_AFFINE 6026 | 1*SD_WAKE_AFFINE
6064 | 0*SD_SHARE_CPUPOWER 6027 | 0*SD_SHARE_CPUPOWER
6065 | 0*SD_SHARE_PKG_RESOURCES 6028 | 0*SD_SHARE_PKG_RESOURCES
6066 | 1*SD_SERIALIZE 6029 | 0*SD_SERIALIZE
6067 | 0*SD_PREFER_SIBLING 6030 | 0*SD_PREFER_SIBLING
6068 | 1*SD_NUMA 6031 | 0*SD_NUMA
6069 | sd_local_flags(level) 6032 | sd_flags
6070 , 6033 ,
6034
6071 .last_balance = jiffies, 6035 .last_balance = jiffies,
6072 .balance_interval = sd_weight, 6036 .balance_interval = sd_weight,
6037 .smt_gain = 0,
6073 .max_newidle_lb_cost = 0, 6038 .max_newidle_lb_cost = 0,
6074 .next_decay_max_lb_cost = jiffies, 6039 .next_decay_max_lb_cost = jiffies,
6040#ifdef CONFIG_SCHED_DEBUG
6041 .name = tl->name,
6042#endif
6075 }; 6043 };
6076 SD_INIT_NAME(sd, NUMA);
6077 sd->private = &tl->data;
6078 6044
6079 /* 6045 /*
6080 * Ugly hack to pass state to sd_numa_mask()... 6046 * Convert topological properties into behaviour.
6081 */ 6047 */
6082 sched_domains_curr_level = tl->numa_level; 6048
6049 if (sd->flags & SD_SHARE_CPUPOWER) {
6050 sd->imbalance_pct = 110;
6051 sd->smt_gain = 1178; /* ~15% */
6052
6053 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6054 sd->imbalance_pct = 117;
6055 sd->cache_nice_tries = 1;
6056 sd->busy_idx = 2;
6057
6058#ifdef CONFIG_NUMA
6059 } else if (sd->flags & SD_NUMA) {
6060 sd->cache_nice_tries = 2;
6061 sd->busy_idx = 3;
6062 sd->idle_idx = 2;
6063
6064 sd->flags |= SD_SERIALIZE;
6065 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6066 sd->flags &= ~(SD_BALANCE_EXEC |
6067 SD_BALANCE_FORK |
6068 SD_WAKE_AFFINE);
6069 }
6070
6071#endif
6072 } else {
6073 sd->flags |= SD_PREFER_SIBLING;
6074 sd->cache_nice_tries = 1;
6075 sd->busy_idx = 2;
6076 sd->idle_idx = 1;
6077 }
6078
6079 sd->private = &tl->data;
6083 6080
6084 return sd; 6081 return sd;
6085} 6082}
6086 6083
6084/*
6085 * Topology list, bottom-up.
6086 */
6087static struct sched_domain_topology_level default_topology[] = {
6088#ifdef CONFIG_SCHED_SMT
6089 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6090#endif
6091#ifdef CONFIG_SCHED_MC
6092 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6093#endif
6094 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6095 { NULL, },
6096};
6097
6098struct sched_domain_topology_level *sched_domain_topology = default_topology;
6099
6100#define for_each_sd_topology(tl) \
6101 for (tl = sched_domain_topology; tl->mask; tl++)
6102
6103void set_sched_topology(struct sched_domain_topology_level *tl)
6104{
6105 sched_domain_topology = tl;
6106}
6107
6108#ifdef CONFIG_NUMA
6109
6087static const struct cpumask *sd_numa_mask(int cpu) 6110static const struct cpumask *sd_numa_mask(int cpu)
6088{ 6111{
6089 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6112 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6227,7 +6250,10 @@ static void sched_init_numa(void)
6227 } 6250 }
6228 } 6251 }
6229 6252
6230 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6253 /* Compute default topology size */
6254 for (i = 0; sched_domain_topology[i].mask; i++);
6255
6256 tl = kzalloc((i + level + 1) *
6231 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6257 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6232 if (!tl) 6258 if (!tl)
6233 return; 6259 return;
@@ -6235,18 +6261,19 @@ static void sched_init_numa(void)
6235 /* 6261 /*
6236 * Copy the default topology bits.. 6262 * Copy the default topology bits..
6237 */ 6263 */
6238 for (i = 0; default_topology[i].init; i++) 6264 for (i = 0; sched_domain_topology[i].mask; i++)
6239 tl[i] = default_topology[i]; 6265 tl[i] = sched_domain_topology[i];
6240 6266
6241 /* 6267 /*
6242 * .. and append 'j' levels of NUMA goodness. 6268 * .. and append 'j' levels of NUMA goodness.
6243 */ 6269 */
6244 for (j = 0; j < level; i++, j++) { 6270 for (j = 0; j < level; i++, j++) {
6245 tl[i] = (struct sched_domain_topology_level){ 6271 tl[i] = (struct sched_domain_topology_level){
6246 .init = sd_numa_init,
6247 .mask = sd_numa_mask, 6272 .mask = sd_numa_mask,
6273 .sd_flags = cpu_numa_flags,
6248 .flags = SDTL_OVERLAP, 6274 .flags = SDTL_OVERLAP,
6249 .numa_level = j, 6275 .numa_level = j,
6276 SD_INIT_NAME(NUMA)
6250 }; 6277 };
6251 } 6278 }
6252 6279
@@ -6404,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6404 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6431 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6405 struct sched_domain *child, int cpu) 6432 struct sched_domain *child, int cpu)
6406{ 6433{
6407 struct sched_domain *sd = tl->init(tl, cpu); 6434 struct sched_domain *sd = sd_init(tl, cpu);
6408 if (!sd) 6435 if (!sd)
6409 return child; 6436 return child;
6410 6437
@@ -6974,6 +7001,7 @@ void __init sched_init(void)
6974 if (cpu_isolated_map == NULL) 7001 if (cpu_isolated_map == NULL)
6975 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7002 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6976 idle_thread_set_boot_cpu(); 7003 idle_thread_set_boot_cpu();
7004 set_cpu_rq_start_time();
6977#endif 7005#endif
6978 init_sched_fair_class(); 7006 init_sched_fair_class();
6979 7007
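With sd_init() consuming a generic table and set_sched_topology() available, an architecture no longer needs the per-level SD_*_INIT templates that were deleted above. The fragment below is a hedged sketch of what an arch-private table could look like, mirroring the default_topology[] initializers in this diff; the arch_smt_flags() helper, the chosen flags, the headers and the init call site are assumptions for illustration, not code from this series.

#include <linux/sched.h>      /* assumed home of set_sched_topology() and SD_* flags */
#include <linux/topology.h>   /* assumed home of cpu_smt_mask(), cpu_coregroup_mask() */
#include <linux/init.h>

/* assumption: on this arch, SMT siblings also share a power domain */
static int arch_smt_flags(void)
{
    return SD_SHARE_CPUPOWER | SD_SHARE_POWERDOMAIN;
}

static struct sched_domain_topology_level arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
    { cpu_smt_mask, arch_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
    { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
    { cpu_cpu_mask, SD_INIT_NAME(DIE) },
    { NULL, },
};

static void __init arch_override_sched_topology(void)
{
    /* assumed call site: early in arch SMP bring-up, before domains are built */
    set_sched_topology(arch_topology);
}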
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 800e99b99075..f9ca7d19781a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
520 * We need to take care of possible races here. In fact, the 520 * We need to take care of possible races here. In fact, the
521 * task might have changed its scheduling policy to something 521 * task might have changed its scheduling policy to something
522 * different from SCHED_DEADLINE or changed its reservation 522 * different from SCHED_DEADLINE or changed its reservation
523 * parameters (through sched_setscheduler()). 523 * parameters (through sched_setattr()).
524 */ 524 */
525 if (!dl_task(p) || dl_se->dl_new) 525 if (!dl_task(p) || dl_se->dl_new)
526 goto unlock; 526 goto unlock;
@@ -741,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
741 741
742 WARN_ON(!dl_prio(prio)); 742 WARN_ON(!dl_prio(prio));
743 dl_rq->dl_nr_running++; 743 dl_rq->dl_nr_running++;
744 inc_nr_running(rq_of_dl_rq(dl_rq)); 744 add_nr_running(rq_of_dl_rq(dl_rq), 1);
745 745
746 inc_dl_deadline(dl_rq, deadline); 746 inc_dl_deadline(dl_rq, deadline);
747 inc_dl_migration(dl_se, dl_rq); 747 inc_dl_migration(dl_se, dl_rq);
@@ -755,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
755 WARN_ON(!dl_prio(prio)); 755 WARN_ON(!dl_prio(prio));
756 WARN_ON(!dl_rq->dl_nr_running); 756 WARN_ON(!dl_rq->dl_nr_running);
757 dl_rq->dl_nr_running--; 757 dl_rq->dl_nr_running--;
758 dec_nr_running(rq_of_dl_rq(dl_rq)); 758 sub_nr_running(rq_of_dl_rq(dl_rq), 1);
759 759
760 dec_dl_deadline(dl_rq, dl_se->deadline); 760 dec_dl_deadline(dl_rq, dl_se->deadline);
761 dec_dl_migration(dl_se, dl_rq); 761 dec_dl_migration(dl_se, dl_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fdb96de81a5..c9617b73bcc0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
1095 env->best_cpu = env->dst_cpu; 1095 env->best_cpu = env->dst_cpu;
1096} 1096}
1097 1097
1098static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1099 long src_load, long dst_load,
1100 struct task_numa_env *env)
1101{
1102 long imb, old_imb;
1103
1104 /* We care about the slope of the imbalance, not the direction. */
1105 if (dst_load < src_load)
1106 swap(dst_load, src_load);
1107
1108 /* Is the difference below the threshold? */
1109 imb = dst_load * 100 - src_load * env->imbalance_pct;
1110 if (imb <= 0)
1111 return false;
1112
1113 /*
1114 * The imbalance is above the allowed threshold.
1115 * Compare it with the old imbalance.
1116 */
1117 if (orig_dst_load < orig_src_load)
1118 swap(orig_dst_load, orig_src_load);
1119
1120 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
1121
1122 /* Would this change make things worse? */
1123 return (imb > old_imb);
1124}
1125
1098/* 1126/*
1099 * This checks if the overall compute and NUMA accesses of the system would 1127 * This checks if the overall compute and NUMA accesses of the system would
1100 * be improved if the source task was migrated to the target dst_cpu taking 1128
@@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
1107 struct rq *src_rq = cpu_rq(env->src_cpu); 1135 struct rq *src_rq = cpu_rq(env->src_cpu);
1108 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1136 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1109 struct task_struct *cur; 1137 struct task_struct *cur;
1110 long dst_load, src_load; 1138 long orig_src_load, src_load;
1139 long orig_dst_load, dst_load;
1111 long load; 1140 long load;
1112 long imp = (groupimp > 0) ? groupimp : taskimp; 1141 long imp = (groupimp > 0) ? groupimp : taskimp;
1113 1142
@@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
1181 * In the overloaded case, try and keep the load balanced. 1210 * In the overloaded case, try and keep the load balanced.
1182 */ 1211 */
1183balance: 1212balance:
1184 dst_load = env->dst_stats.load; 1213 orig_dst_load = env->dst_stats.load;
1185 src_load = env->src_stats.load; 1214 orig_src_load = env->src_stats.load;
1186 1215
1187 /* XXX missing power terms */ 1216 /* XXX missing power terms */
1188 load = task_h_load(env->p); 1217 load = task_h_load(env->p);
1189 dst_load += load; 1218 dst_load = orig_dst_load + load;
1190 src_load -= load; 1219 src_load = orig_src_load - load;
1191 1220
1192 if (cur) { 1221 if (cur) {
1193 load = task_h_load(cur); 1222 load = task_h_load(cur);
@@ -1195,11 +1224,8 @@ balance:
1195 src_load += load; 1224 src_load += load;
1196 } 1225 }
1197 1226
1198 /* make src_load the smaller */ 1227 if (load_too_imbalanced(orig_src_load, orig_dst_load,
1199 if (dst_load < src_load) 1228 src_load, dst_load, env))
1200 swap(dst_load, src_load);
1201
1202 if (src_load * env->imbalance_pct < dst_load * 100)
1203 goto unlock; 1229 goto unlock;
1204 1230
1205assign: 1231assign:
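To see what load_too_imbalanced() rejects in the path just above, here is the same arithmetic as a standalone program with made-up load figures and the default imbalance_pct of 125 from sd_init(): a move that widens an already-excessive gap is refused, while a move back toward balance always passes.

#include <stdio.h>
#include <stdbool.h>

/* imbalance measured on the slope only, like the helper above */
static long slope_imb(long src, long dst, int pct)
{
    if (dst < src) {
        long tmp = src;
        src = dst;
        dst = tmp;
    }
    return dst * 100 - src * pct;
}

static bool too_imbalanced(long orig_src, long orig_dst,
                           long src, long dst, int pct)
{
    long imb = slope_imb(src, dst, pct);

    if (imb <= 0)       /* still within the imbalance_pct threshold */
        return false;

    /* above the threshold: reject only if it is worse than before */
    return imb > slope_imb(orig_src, orig_dst, pct);
}

int main(void)
{
    int pct = 125;      /* example imbalance_pct */

    /* widening an already lopsided 1000/1400 into 900/1500: rejected */
    printf("%d\n", too_imbalanced(1000, 1400, 900, 1500, pct));    /* 1 */

    /* moving back toward balance, 1100/1300: allowed */
    printf("%d\n", too_imbalanced(1000, 1400, 1100, 1300, pct));   /* 0 */
    return 0;
}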
@@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)
1301 if (env.best_cpu == -1) 1327 if (env.best_cpu == -1)
1302 return -EAGAIN; 1328 return -EAGAIN;
1303 1329
1304 sched_setnuma(p, env.dst_nid); 1330 /*
1331 * If the task is part of a workload that spans multiple NUMA nodes,
1332 * and is migrating into one of the workload's active nodes, remember
1333 * this node as the task's preferred numa node, so the workload can
1334 * settle down.
1335 * A task that migrated to a second choice node will be better off
1336 * trying for a better one later. Do not set the preferred node here.
1337 */
1338 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
1339 sched_setnuma(p, env.dst_nid);
1305 1340
1306 /* 1341 /*
1307 * Reset the scan period if the task is being rescheduled on an 1342 * Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)
1326/* Attempt to migrate a task to a CPU on the preferred node. */ 1361/* Attempt to migrate a task to a CPU on the preferred node. */
1327static void numa_migrate_preferred(struct task_struct *p) 1362static void numa_migrate_preferred(struct task_struct *p)
1328{ 1363{
1364 unsigned long interval = HZ;
1365
1329 /* This task has no NUMA fault statistics yet */ 1366 /* This task has no NUMA fault statistics yet */
1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1367 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1331 return; 1368 return;
1332 1369
1333 /* Periodically retry migrating the task to the preferred node */ 1370 /* Periodically retry migrating the task to the preferred node */
1334 p->numa_migrate_retry = jiffies + HZ; 1371 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1372 p->numa_migrate_retry = jiffies + interval;
1335 1373
1336 /* Success if task is already running on preferred CPU */ 1374 /* Success if task is already running on preferred CPU */
1337 if (task_node(p) == p->numa_preferred_nid) 1375 if (task_node(p) == p->numa_preferred_nid)
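The retry interval above now scales with the NUMA scan period instead of being pinned at one second. A small worked example of the arithmetic, with an assumed HZ and an example scan period; msecs_to_jiffies_sketch() is a simplified stand-in for the kernel helper.

#include <stdio.h>

#define HZ 250  /* example tick rate for the arithmetic below */

static unsigned long msecs_to_jiffies_sketch(unsigned long ms)
{
    return ms * HZ / 1000;
}

int main(void)
{
    unsigned long scan_period_ms = 1000;  /* example p->numa_scan_period */
    unsigned long interval = HZ;          /* old behaviour: retry in one second */

    unsigned long fast = msecs_to_jiffies_sketch(scan_period_ms) / 16;
    if (fast < interval)
        interval = fast;

    /* 1000ms scan period at HZ=250 -> retry after 15 jiffies (~60ms), not 250 */
    printf("numa_migrate_retry in %lu jiffies\n", interval);
    return 0;
}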
@@ -1738,6 +1776,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1738 struct task_struct *p = current; 1776 struct task_struct *p = current;
1739 bool migrated = flags & TNF_MIGRATED; 1777 bool migrated = flags & TNF_MIGRATED;
1740 int cpu_node = task_node(current); 1778 int cpu_node = task_node(current);
1779 int local = !!(flags & TNF_FAULT_LOCAL);
1741 int priv; 1780 int priv;
1742 1781
1743 if (!numabalancing_enabled) 1782 if (!numabalancing_enabled)
@@ -1786,6 +1825,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1786 task_numa_group(p, last_cpupid, flags, &priv); 1825 task_numa_group(p, last_cpupid, flags, &priv);
1787 } 1826 }
1788 1827
1828 /*
1829 * If a workload spans multiple NUMA nodes, a shared fault that
1830 * occurs wholly within the set of nodes that the workload is
1831 * actively using should be counted as local. This allows the
1832 * scan rate to slow down when a workload has settled down.
1833 */
1834 if (!priv && !local && p->numa_group &&
1835 node_isset(cpu_node, p->numa_group->active_nodes) &&
1836 node_isset(mem_node, p->numa_group->active_nodes))
1837 local = 1;
1838
1789 task_numa_placement(p); 1839 task_numa_placement(p);
1790 1840
1791 /* 1841 /*
@@ -1800,7 +1850,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1800 1850
1801 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 1851 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1802 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 1852 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1803 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1853 p->numa_faults_locality[local] += pages;
1804} 1854}
1805 1855
1806static void reset_ptenuma_scan(struct task_struct *p) 1856static void reset_ptenuma_scan(struct task_struct *p)
@@ -3301,7 +3351,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3301 } 3351 }
3302 3352
3303 if (!se) 3353 if (!se)
3304 rq->nr_running -= task_delta; 3354 sub_nr_running(rq, task_delta);
3305 3355
3306 cfs_rq->throttled = 1; 3356 cfs_rq->throttled = 1;
3307 cfs_rq->throttled_clock = rq_clock(rq); 3357 cfs_rq->throttled_clock = rq_clock(rq);
@@ -3352,7 +3402,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3352 } 3402 }
3353 3403
3354 if (!se) 3404 if (!se)
3355 rq->nr_running += task_delta; 3405 add_nr_running(rq, task_delta);
3356 3406
3357 /* determine whether we need to wake up potentially idle cpu */ 3407 /* determine whether we need to wake up potentially idle cpu */
3358 if (rq->curr == rq->idle && rq->cfs.nr_running) 3408 if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3884,7 +3934,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3884 3934
3885 if (!se) { 3935 if (!se) {
3886 update_rq_runnable_avg(rq, rq->nr_running); 3936 update_rq_runnable_avg(rq, rq->nr_running);
3887 inc_nr_running(rq); 3937 add_nr_running(rq, 1);
3888 } 3938 }
3889 hrtick_update(rq); 3939 hrtick_update(rq);
3890} 3940}
@@ -3944,7 +3994,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3944 } 3994 }
3945 3995
3946 if (!se) { 3996 if (!se) {
3947 dec_nr_running(rq); 3997 sub_nr_running(rq, 1);
3948 update_rq_runnable_avg(rq, 1); 3998 update_rq_runnable_avg(rq, 1);
3949 } 3999 }
3950 hrtick_update(rq); 4000 hrtick_update(rq);
@@ -4015,7 +4065,7 @@ static void record_wakee(struct task_struct *p)
4015 * about the loss. 4065 * about the loss.
4016 */ 4066 */
4017 if (jiffies > current->wakee_flip_decay_ts + HZ) { 4067 if (jiffies > current->wakee_flip_decay_ts + HZ) {
4018 current->wakee_flips = 0; 4068 current->wakee_flips >>= 1;
4019 current->wakee_flip_decay_ts = jiffies; 4069 current->wakee_flip_decay_ts = jiffies;
4020 } 4070 }
4021 4071
@@ -4449,10 +4499,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4449 sd = tmp; 4499 sd = tmp;
4450 } 4500 }
4451 4501
4452 if (affine_sd) { 4502 if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4453 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4503 prev_cpu = cpu;
4454 prev_cpu = cpu;
4455 4504
4505 if (sd_flag & SD_BALANCE_WAKE) {
4456 new_cpu = select_idle_sibling(p, prev_cpu); 4506 new_cpu = select_idle_sibling(p, prev_cpu);
4457 goto unlock; 4507 goto unlock;
4458 } 4508 }
@@ -4520,6 +4570,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4520 atomic_long_add(se->avg.load_avg_contrib, 4570 atomic_long_add(se->avg.load_avg_contrib,
4521 &cfs_rq->removed_load); 4571 &cfs_rq->removed_load);
4522 } 4572 }
4573
4574 /* We have migrated, no longer consider this task hot */
4575 se->exec_start = 0;
4523} 4576}
4524#endif /* CONFIG_SMP */ 4577#endif /* CONFIG_SMP */
4525 4578
@@ -5070,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)
5070/* Returns true if the destination node has incurred more faults */ 5123/* Returns true if the destination node has incurred more faults */
5071static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5124static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5072{ 5125{
5126 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5073 int src_nid, dst_nid; 5127 int src_nid, dst_nid;
5074 5128
5075 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5129 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5083,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5083 if (src_nid == dst_nid) 5137 if (src_nid == dst_nid)
5084 return false; 5138 return false;
5085 5139
5086 /* Always encourage migration to the preferred node. */ 5140 if (numa_group) {
5087 if (dst_nid == p->numa_preferred_nid) 5141 /* Task is already in the group's interleave set. */
5088 return true; 5142 if (node_isset(src_nid, numa_group->active_nodes))
5143 return false;
5144
5145 /* Task is moving into the group's interleave set. */
5146 if (node_isset(dst_nid, numa_group->active_nodes))
5147 return true;
5089 5148
5090 /* If both task and group weight improve, this move is a winner. */ 5149 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5091 if (task_weight(p, dst_nid) > task_weight(p, src_nid) && 5150 }
5092 group_weight(p, dst_nid) > group_weight(p, src_nid)) 5151
5152 /* Encourage migration to the preferred node. */
5153 if (dst_nid == p->numa_preferred_nid)
5093 return true; 5154 return true;
5094 5155
5095 return false; 5156 return task_faults(p, dst_nid) > task_faults(p, src_nid);
5096} 5157}
5097 5158
5098 5159
5099static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5160static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5100{ 5161{
5162 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5101 int src_nid, dst_nid; 5163 int src_nid, dst_nid;
5102 5164
5103 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5165 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5112,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5112 if (src_nid == dst_nid) 5174 if (src_nid == dst_nid)
5113 return false; 5175 return false;
5114 5176
5177 if (numa_group) {
5178 /* Task is moving within/into the group's interleave set. */
5179 if (node_isset(dst_nid, numa_group->active_nodes))
5180 return false;
5181
5182 /* Task is moving out of the group's interleave set. */
5183 if (node_isset(src_nid, numa_group->active_nodes))
5184 return true;
5185
5186 return group_faults(p, dst_nid) < group_faults(p, src_nid);
5187 }
5188
5115 /* Migrating away from the preferred node is always bad. */ 5189 /* Migrating away from the preferred node is always bad. */
5116 if (src_nid == p->numa_preferred_nid) 5190 if (src_nid == p->numa_preferred_nid)
5117 return true; 5191 return true;
5118 5192
5119 /* If either task or group weight get worse, don't do it. */ 5193 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5120 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
5121 group_weight(p, dst_nid) < group_weight(p, src_nid))
5122 return true;
5123
5124 return false;
5125} 5194}
5126 5195
5127#else 5196#else
@@ -5564,6 +5633,7 @@ static unsigned long scale_rt_power(int cpu)
5564{ 5633{
5565 struct rq *rq = cpu_rq(cpu); 5634 struct rq *rq = cpu_rq(cpu);
5566 u64 total, available, age_stamp, avg; 5635 u64 total, available, age_stamp, avg;
5636 s64 delta;
5567 5637
5568 /* 5638 /*
5569 * Since we're reading these variables without serialization make sure 5639 * Since we're reading these variables without serialization make sure
@@ -5572,7 +5642,11 @@ static unsigned long scale_rt_power(int cpu)
5572 age_stamp = ACCESS_ONCE(rq->age_stamp); 5642 age_stamp = ACCESS_ONCE(rq->age_stamp);
5573 avg = ACCESS_ONCE(rq->rt_avg); 5643 avg = ACCESS_ONCE(rq->rt_avg);
5574 5644
5575 total = sched_avg_period() + (rq_clock(rq) - age_stamp); 5645 delta = rq_clock(rq) - age_stamp;
5646 if (unlikely(delta < 0))
5647 delta = 0;
5648
5649 total = sched_avg_period() + delta;
5576 5650
5577 if (unlikely(total < avg)) { 5651 if (unlikely(total < avg)) {
5578 /* Ensures that power won't end up being negative */ 5652 /* Ensures that power won't end up being negative */
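The new s64 delta guards against the unserialized reads mentioned in the comment above: if rq_clock() momentarily reads older than age_stamp, the unsigned subtraction wraps to an enormous total. A standalone illustration with made-up clock values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t clock = 1000, age_stamp = 1010;  /* racy read: stamp is ahead */

    uint64_t wrapped = clock - age_stamp;     /* ~2^64, nonsense total */

    int64_t delta = (int64_t)(clock - age_stamp);
    if (delta < 0)
        delta = 0;                            /* what the patch does */

    printf("wrapped=%llu clamped=%lld\n",
           (unsigned long long)wrapped, (long long)delta);
    return 0;
}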
@@ -6640,17 +6714,44 @@ out:
6640 return ld_moved; 6714 return ld_moved;
6641} 6715}
6642 6716
6717static inline unsigned long
6718get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
6719{
6720 unsigned long interval = sd->balance_interval;
6721
6722 if (cpu_busy)
6723 interval *= sd->busy_factor;
6724
6725 /* scale ms to jiffies */
6726 interval = msecs_to_jiffies(interval);
6727 interval = clamp(interval, 1UL, max_load_balance_interval);
6728
6729 return interval;
6730}
6731
6732static inline void
6733update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
6734{
6735 unsigned long interval, next;
6736
6737 interval = get_sd_balance_interval(sd, cpu_busy);
6738 next = sd->last_balance + interval;
6739
6740 if (time_after(*next_balance, next))
6741 *next_balance = next;
6742}
6743
6643/* 6744/*
6644 * idle_balance is called by schedule() if this_cpu is about to become 6745 * idle_balance is called by schedule() if this_cpu is about to become
6645 * idle. Attempts to pull tasks from other CPUs. 6746 * idle. Attempts to pull tasks from other CPUs.
6646 */ 6747 */
6647static int idle_balance(struct rq *this_rq) 6748static int idle_balance(struct rq *this_rq)
6648{ 6749{
6750 unsigned long next_balance = jiffies + HZ;
6751 int this_cpu = this_rq->cpu;
6649 struct sched_domain *sd; 6752 struct sched_domain *sd;
6650 int pulled_task = 0; 6753 int pulled_task = 0;
6651 unsigned long next_balance = jiffies + HZ;
6652 u64 curr_cost = 0; 6754 u64 curr_cost = 0;
6653 int this_cpu = this_rq->cpu;
6654 6755
6655 idle_enter_fair(this_rq); 6756 idle_enter_fair(this_rq);
6656 6757
@@ -6660,8 +6761,15 @@ static int idle_balance(struct rq *this_rq)
6660 */ 6761 */
6661 this_rq->idle_stamp = rq_clock(this_rq); 6762 this_rq->idle_stamp = rq_clock(this_rq);
6662 6763
6663 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6764 if (this_rq->avg_idle < sysctl_sched_migration_cost) {
6765 rcu_read_lock();
6766 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6767 if (sd)
6768 update_next_balance(sd, 0, &next_balance);
6769 rcu_read_unlock();
6770
6664 goto out; 6771 goto out;
6772 }
6665 6773
6666 /* 6774 /*
6667 * Drop the rq->lock, but keep IRQ/preempt disabled. 6775 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6671,20 +6779,20 @@ static int idle_balance(struct rq *this_rq)
6671 update_blocked_averages(this_cpu); 6779 update_blocked_averages(this_cpu);
6672 rcu_read_lock(); 6780 rcu_read_lock();
6673 for_each_domain(this_cpu, sd) { 6781 for_each_domain(this_cpu, sd) {
6674 unsigned long interval;
6675 int continue_balancing = 1; 6782 int continue_balancing = 1;
6676 u64 t0, domain_cost; 6783 u64 t0, domain_cost;
6677 6784
6678 if (!(sd->flags & SD_LOAD_BALANCE)) 6785 if (!(sd->flags & SD_LOAD_BALANCE))
6679 continue; 6786 continue;
6680 6787
6681 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) 6788 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
6789 update_next_balance(sd, 0, &next_balance);
6682 break; 6790 break;
6791 }
6683 6792
6684 if (sd->flags & SD_BALANCE_NEWIDLE) { 6793 if (sd->flags & SD_BALANCE_NEWIDLE) {
6685 t0 = sched_clock_cpu(this_cpu); 6794 t0 = sched_clock_cpu(this_cpu);
6686 6795
6687 /* If we've pulled tasks over stop searching: */
6688 pulled_task = load_balance(this_cpu, this_rq, 6796 pulled_task = load_balance(this_cpu, this_rq,
6689 sd, CPU_NEWLY_IDLE, 6797 sd, CPU_NEWLY_IDLE,
6690 &continue_balancing); 6798 &continue_balancing);
@@ -6696,10 +6804,13 @@ static int idle_balance(struct rq *this_rq)
6696 curr_cost += domain_cost; 6804 curr_cost += domain_cost;
6697 } 6805 }
6698 6806
6699 interval = msecs_to_jiffies(sd->balance_interval); 6807 update_next_balance(sd, 0, &next_balance);
6700 if (time_after(next_balance, sd->last_balance + interval)) 6808
6701 next_balance = sd->last_balance + interval; 6809 /*
6702 if (pulled_task) 6810 * Stop searching for tasks to pull if there are
6811 * now runnable tasks on this rq.
6812 */
6813 if (pulled_task || this_rq->nr_running > 0)
6703 break; 6814 break;
6704 } 6815 }
6705 rcu_read_unlock(); 6816 rcu_read_unlock();
@@ -6717,20 +6828,13 @@ static int idle_balance(struct rq *this_rq)
6717 if (this_rq->cfs.h_nr_running && !pulled_task) 6828 if (this_rq->cfs.h_nr_running && !pulled_task)
6718 pulled_task = 1; 6829 pulled_task = 1;
6719 6830
6720 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6831out:
6721 /* 6832 /* Move the next balance forward */
6722 * We are going idle. next_balance may be set based on 6833 if (time_after(this_rq->next_balance, next_balance))
6723 * a busy processor. So reset next_balance.
6724 */
6725 this_rq->next_balance = next_balance; 6834 this_rq->next_balance = next_balance;
6726 }
6727 6835
6728out:
6729 /* Is there a task of a high priority class? */ 6836 /* Is there a task of a high priority class? */
6730 if (this_rq->nr_running != this_rq->cfs.h_nr_running && 6837 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
6731 ((this_rq->stop && this_rq->stop->on_rq) ||
6732 this_rq->dl.dl_nr_running ||
6733 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6734 pulled_task = -1; 6838 pulled_task = -1;
6735 6839
6736 if (pulled_task) { 6840 if (pulled_task) {
@@ -7011,16 +7115,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7011 break; 7115 break;
7012 } 7116 }
7013 7117
7014 interval = sd->balance_interval; 7118 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7015 if (idle != CPU_IDLE)
7016 interval *= sd->busy_factor;
7017
7018 /* scale ms to jiffies */
7019 interval = msecs_to_jiffies(interval);
7020 interval = clamp(interval, 1UL, max_load_balance_interval);
7021 7119
7022 need_serialize = sd->flags & SD_SERIALIZE; 7120 need_serialize = sd->flags & SD_SERIALIZE;
7023
7024 if (need_serialize) { 7121 if (need_serialize) {
7025 if (!spin_trylock(&balancing)) 7122 if (!spin_trylock(&balancing))
7026 goto out; 7123 goto out;
@@ -7036,6 +7133,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7036 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; 7133 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7037 } 7134 }
7038 sd->last_balance = jiffies; 7135 sd->last_balance = jiffies;
7136 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7039 } 7137 }
7040 if (need_serialize) 7138 if (need_serialize)
7041 spin_unlock(&balancing); 7139 spin_unlock(&balancing);
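get_sd_balance_interval() now centralizes the busy scaling and clamping that rebalance_domains() used to open-code, and idle_balance() reuses it through update_next_balance(). The same arithmetic with sample numbers; HZ, the domain parameters and the clamp ceiling here are stand-ins chosen for the example, not the kernel's actual values.

#include <stdio.h>

#define HZ 250  /* example tick rate */

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
    unsigned long balance_interval_ms = 8;   /* example sd->balance_interval */
    unsigned long busy_factor = 32;          /* example sd->busy_factor */
    unsigned long max_interval = HZ / 10;    /* stand-in for max_load_balance_interval */
    int cpu_busy = 1;

    unsigned long interval = balance_interval_ms;

    if (cpu_busy)
        interval *= busy_factor;             /* busy CPUs back off: 8ms -> 256ms */

    interval = interval * HZ / 1000;         /* scale ms to jiffies: 64 */
    interval = clamp_ul(interval, 1UL, max_interval);

    /* 8ms * 32 = 256ms = 64 jiffies, clamped to the 25-jiffy ceiling here */
    printf("next balance in %lu jiffies\n", interval);
    return 0;
}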
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a079c7..25b9423abce9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void)
67 * cpuidle_idle_call - the main idle function 67 * cpuidle_idle_call - the main idle function
68 * 68 *
69 * NOTE: no locks or semaphores should be used here 69 * NOTE: no locks or semaphores should be used here
70 * return non-zero on failure
71 */ 70 */
72static int cpuidle_idle_call(void) 71static void cpuidle_idle_call(void)
73{ 72{
74 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 73 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
75 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 74 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
76 int next_state, entered_state, ret; 75 int next_state, entered_state;
77 bool broadcast; 76 bool broadcast;
78 77
79 /* 78 /*
80 * Check if the idle task must be rescheduled. If it is the 79 * Check if the idle task must be rescheduled. If it is the
81 * case, exit the function after re-enabling the local irq and 80 * case, exit the function after re-enabling the local irq.
82 * set again the polling flag
83 */ 81 */
84 if (current_clr_polling_and_test()) { 82 if (need_resched()) {
85 local_irq_enable(); 83 local_irq_enable();
86 __current_set_polling(); 84 return;
87 return 0;
88 } 85 }
89 86
90 /* 87 /*
@@ -101,96 +98,79 @@ static int cpuidle_idle_call(void)
101 rcu_idle_enter(); 98 rcu_idle_enter();
102 99
103 /* 100 /*
104 * Check if the cpuidle framework is ready, otherwise fallback 101 * Ask the cpuidle framework to choose a convenient idle state.
105 * to the default arch specific idle method 102 * Fall back to the default arch idle method on errors.
106 */ 103 */
107 ret = cpuidle_enabled(drv, dev); 104 next_state = cpuidle_select(drv, dev);
108 105 if (next_state < 0) {
109 if (!ret) { 106use_default:
110 /* 107 /*
111 * Ask the governor to choose an idle state it thinks 108 * We can't use the cpuidle framework, let's use the default
112 * it is convenient to go to. There is *always* a 109 * idle routine.
113 * convenient idle state
114 */ 110 */
115 next_state = cpuidle_select(drv, dev); 111 if (current_clr_polling_and_test())
116
117 /*
118 * The idle task must be scheduled, it is pointless to
119 * go to idle, just update no idle residency and get
120 * out of this function
121 */
122 if (current_clr_polling_and_test()) {
123 dev->last_residency = 0;
124 entered_state = next_state;
125 local_irq_enable(); 112 local_irq_enable();
126 } else { 113 else
127 broadcast = !!(drv->states[next_state].flags & 114 arch_cpu_idle();
128 CPUIDLE_FLAG_TIMER_STOP); 115
129 116 goto exit_idle;
130 if (broadcast)
131 /*
132 * Tell the time framework to switch
133 * to a broadcast timer because our
134 * local timer will be shutdown. If a
135 * local timer is used from another
136 * cpu as a broadcast timer, this call
137 * may fail if it is not available
138 */
139 ret = clockevents_notify(
140 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
141 &dev->cpu);
142
143 if (!ret) {
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously
148 * returned by the governor
149 * decision. This function will block
150 * until an interrupt occurs and will
151 * take care of re-enabling the local
152 * interrupts
153 */
154 entered_state = cpuidle_enter(drv, dev,
155 next_state);
156
157 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
158 dev->cpu);
159
160 if (broadcast)
161 clockevents_notify(
162 CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
163 &dev->cpu);
164
165 /*
166 * Give the governor an opportunity to reflect on the
167 * outcome
168 */
169 cpuidle_reflect(dev, entered_state);
170 }
171 }
172 } 117 }
173 118
119
174 /* 120 /*
175 * We can't use the cpuidle framework, let's use the default 121 * The idle task must be scheduled, it is pointless to
176 * idle routine 122 * go to idle, just update no idle residency and get
123 * out of this function
177 */ 124 */
178 if (ret) 125 if (current_clr_polling_and_test()) {
179 arch_cpu_idle(); 126 dev->last_residency = 0;
127 entered_state = next_state;
128 local_irq_enable();
129 goto exit_idle;
130 }
131
132 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
180 133
134 /*
135 * Tell the time framework to switch to a broadcast timer
136 * because our local timer will be shut down. If a local timer
137 * is used from another cpu as a broadcast timer, this call may
138 * fail if it is not available
139 */
140 if (broadcast &&
141 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
142 goto use_default;
143
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously returned by the governor decision.
148 * This function will block until an interrupt occurs and will take
149 * care of re-enabling the local interrupts
150 */
151 entered_state = cpuidle_enter(drv, dev, next_state);
152
153 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
154
155 if (broadcast)
156 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
157
158 /*
159 * Give the governor an opportunity to reflect on the outcome
160 */
161 cpuidle_reflect(dev, entered_state);
162
163exit_idle:
181 __current_set_polling(); 164 __current_set_polling();
182 165
183 /* 166 /*
184 * It is up to the idle functions to enable back the local 167 * It is up to the idle functions to reenable local interrupts
185 * interrupt
186 */ 168 */
187 if (WARN_ON_ONCE(irqs_disabled())) 169 if (WARN_ON_ONCE(irqs_disabled()))
188 local_irq_enable(); 170 local_irq_enable();
189 171
190 rcu_idle_exit(); 172 rcu_idle_exit();
191 start_critical_timings(); 173 start_critical_timings();
192
193 return 0;
194} 174}
195 175
196/* 176/*
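The reworked idle entry above replaces the threaded 'ret' plumbing with two labels: a cpuidle_select() failure or a broadcast-timer failure jumps to the default arch idle path, and all exits funnel through exit_idle. The sketch below reproduces only that control-flow shape in user space; every kernel call is replaced by a stub whose return value is chosen to exercise the fallback.

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for the kernel calls, not the real APIs */
static bool need_resched_stub(void)          { return false; }
static int  cpuidle_select_stub(void)        { return -1; }  /* pretend failure */
static bool broadcast_enter_fails_stub(void) { return true; }
static void arch_cpu_idle_stub(void)         { puts("default arch idle"); }
static void cpuidle_enter_stub(int state)    { printf("enter state %d\n", state); }

static void cpuidle_idle_call_sketch(void)
{
    int next_state;

    if (need_resched_stub())
        return;                 /* work pending, do not idle */

    next_state = cpuidle_select_stub();
    if (next_state < 0)
        goto use_default;       /* governor/framework not usable */

    if (broadcast_enter_fails_stub())
        goto use_default;       /* broadcast timer unavailable */

    cpuidle_enter_stub(next_state);
    return;

use_default:
    arch_cpu_idle_stub();
}

int main(void)
{
    cpuidle_idle_call_sketch();
    return 0;
}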
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bd2267ad404f..0ebfd7a29472 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
79 rt_rq->overloaded = 0; 79 rt_rq->overloaded = 0;
80 plist_head_init(&rt_rq->pushable_tasks); 80 plist_head_init(&rt_rq->pushable_tasks);
81#endif 81#endif
82 /* We start in dequeued state, because no RT tasks are queued */
83 rt_rq->rt_queued = 0;
82 84
83 rt_rq->rt_time = 0; 85 rt_rq->rt_time = 0;
84 rt_rq->rt_throttled = 0; 86 rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
112 return rt_se->rt_rq; 114 return rt_se->rt_rq;
113} 115}
114 116
117static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
118{
119 struct rt_rq *rt_rq = rt_se->rt_rq;
120
121 return rt_rq->rq;
122}
123
115void free_rt_sched_group(struct task_group *tg) 124void free_rt_sched_group(struct task_group *tg)
116{ 125{
117 int i; 126 int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
211 return container_of(rt_rq, struct rq, rt); 220 return container_of(rt_rq, struct rq, rt);
212} 221}
213 222
214static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 223static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
215{ 224{
216 struct task_struct *p = rt_task_of(rt_se); 225 struct task_struct *p = rt_task_of(rt_se);
217 struct rq *rq = task_rq(p); 226
227 return task_rq(p);
228}
229
230static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
231{
232 struct rq *rq = rq_of_rt_se(rt_se);
218 233
219 return &rq->rt; 234 return &rq->rt;
220} 235}
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
391} 406}
392#endif /* CONFIG_SMP */ 407#endif /* CONFIG_SMP */
393 408
409static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
410static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
411
394static inline int on_rt_rq(struct sched_rt_entity *rt_se) 412static inline int on_rt_rq(struct sched_rt_entity *rt_se)
395{ 413{
396 return !list_empty(&rt_se->run_list); 414 return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
452 rt_se = rt_rq->tg->rt_se[cpu]; 470 rt_se = rt_rq->tg->rt_se[cpu];
453 471
454 if (rt_rq->rt_nr_running) { 472 if (rt_rq->rt_nr_running) {
455 if (rt_se && !on_rt_rq(rt_se)) 473 if (!rt_se)
474 enqueue_top_rt_rq(rt_rq);
475 else if (!on_rt_rq(rt_se))
456 enqueue_rt_entity(rt_se, false); 476 enqueue_rt_entity(rt_se, false);
477
457 if (rt_rq->highest_prio.curr < curr->prio) 478 if (rt_rq->highest_prio.curr < curr->prio)
458 resched_task(curr); 479 resched_task(curr);
459 } 480 }
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
466 487
467 rt_se = rt_rq->tg->rt_se[cpu]; 488 rt_se = rt_rq->tg->rt_se[cpu];
468 489
469 if (rt_se && on_rt_rq(rt_se)) 490 if (!rt_se)
491 dequeue_top_rt_rq(rt_rq);
492 else if (on_rt_rq(rt_se))
470 dequeue_rt_entity(rt_se); 493 dequeue_rt_entity(rt_se);
471} 494}
472 495
496static inline int rt_rq_throttled(struct rt_rq *rt_rq)
497{
498 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
499}
500
473static int rt_se_boosted(struct sched_rt_entity *rt_se) 501static int rt_se_boosted(struct sched_rt_entity *rt_se)
474{ 502{
475 struct rt_rq *rt_rq = group_rt_rq(rt_se); 503 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
532 560
533static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 561static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
534{ 562{
535 if (rt_rq->rt_nr_running) 563 struct rq *rq = rq_of_rt_rq(rt_rq);
536 resched_task(rq_of_rt_rq(rt_rq)->curr); 564
565 if (!rt_rq->rt_nr_running)
566 return;
567
568 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr);
537} 570}
538 571
539static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
540{ 573{
574 dequeue_top_rt_rq(rt_rq);
575}
576
577static inline int rt_rq_throttled(struct rt_rq *rt_rq)
578{
579 return rt_rq->rt_throttled;
541} 580}
542 581
543static inline const struct cpumask *sched_rt_period_mask(void) 582static inline const struct cpumask *sched_rt_period_mask(void)
@@ -922,6 +961,38 @@ static void update_curr_rt(struct rq *rq)
922 } 961 }
923} 962}
924 963
964static void
965dequeue_top_rt_rq(struct rt_rq *rt_rq)
966{
967 struct rq *rq = rq_of_rt_rq(rt_rq);
968
969 BUG_ON(&rq->rt != rt_rq);
970
971 if (!rt_rq->rt_queued)
972 return;
973
974 BUG_ON(!rq->nr_running);
975
976 sub_nr_running(rq, rt_rq->rt_nr_running);
977 rt_rq->rt_queued = 0;
978}
979
980static void
981enqueue_top_rt_rq(struct rt_rq *rt_rq)
982{
983 struct rq *rq = rq_of_rt_rq(rt_rq);
984
985 BUG_ON(&rq->rt != rt_rq);
986
987 if (rt_rq->rt_queued)
988 return;
989 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
990 return;
991
992 add_nr_running(rq, rt_rq->rt_nr_running);
993 rt_rq->rt_queued = 1;
994}
995
925#if defined CONFIG_SMP 996#if defined CONFIG_SMP
926 997
927static void 998static void
@@ -1045,12 +1116,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1045#endif /* CONFIG_RT_GROUP_SCHED */ 1116#endif /* CONFIG_RT_GROUP_SCHED */
1046 1117
1047static inline 1118static inline
1119unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1120{
1121 struct rt_rq *group_rq = group_rt_rq(rt_se);
1122
1123 if (group_rq)
1124 return group_rq->rt_nr_running;
1125 else
1126 return 1;
1127}
1128
1129static inline
1048void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1130void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1049{ 1131{
1050 int prio = rt_se_prio(rt_se); 1132 int prio = rt_se_prio(rt_se);
1051 1133
1052 WARN_ON(!rt_prio(prio)); 1134 WARN_ON(!rt_prio(prio));
1053 rt_rq->rt_nr_running++; 1135 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1054 1136
1055 inc_rt_prio(rt_rq, prio); 1137 inc_rt_prio(rt_rq, prio);
1056 inc_rt_migration(rt_se, rt_rq); 1138 inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1144,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1062{ 1144{
1063 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1145 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1064 WARN_ON(!rt_rq->rt_nr_running); 1146 WARN_ON(!rt_rq->rt_nr_running);
1065 rt_rq->rt_nr_running--; 1147 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1066 1148
1067 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1149 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1068 dec_rt_migration(rt_se, rt_rq); 1150 dec_rt_migration(rt_se, rt_rq);
@@ -1119,6 +1201,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1119 back = rt_se; 1201 back = rt_se;
1120 } 1202 }
1121 1203
1204 dequeue_top_rt_rq(rt_rq_of_se(back));
1205
1122 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1206 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1123 if (on_rt_rq(rt_se)) 1207 if (on_rt_rq(rt_se))
1124 __dequeue_rt_entity(rt_se); 1208 __dequeue_rt_entity(rt_se);
@@ -1127,13 +1211,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1127 1211
1128static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1212static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1129{ 1213{
1214 struct rq *rq = rq_of_rt_se(rt_se);
1215
1130 dequeue_rt_stack(rt_se); 1216 dequeue_rt_stack(rt_se);
1131 for_each_sched_rt_entity(rt_se) 1217 for_each_sched_rt_entity(rt_se)
1132 __enqueue_rt_entity(rt_se, head); 1218 __enqueue_rt_entity(rt_se, head);
1219 enqueue_top_rt_rq(&rq->rt);
1133} 1220}
1134 1221
1135static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 1222static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1136{ 1223{
1224 struct rq *rq = rq_of_rt_se(rt_se);
1225
1137 dequeue_rt_stack(rt_se); 1226 dequeue_rt_stack(rt_se);
1138 1227
1139 for_each_sched_rt_entity(rt_se) { 1228 for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1231,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1142 if (rt_rq && rt_rq->rt_nr_running) 1231 if (rt_rq && rt_rq->rt_nr_running)
1143 __enqueue_rt_entity(rt_se, false); 1232 __enqueue_rt_entity(rt_se, false);
1144 } 1233 }
1234 enqueue_top_rt_rq(&rq->rt);
1145} 1235}
1146 1236
1147/* 1237/*
@@ -1159,8 +1249,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1159 1249
1160 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1250 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1161 enqueue_pushable_task(rq, p); 1251 enqueue_pushable_task(rq, p);
1162
1163 inc_nr_running(rq);
1164} 1252}
1165 1253
1166static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1254static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1259,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1171 dequeue_rt_entity(rt_se); 1259 dequeue_rt_entity(rt_se);
1172 1260
1173 dequeue_pushable_task(rq, p); 1261 dequeue_pushable_task(rq, p);
1174
1175 dec_nr_running(rq);
1176} 1262}
1177 1263
1178/* 1264/*
@@ -1377,10 +1463,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1377 if (prev->sched_class == &rt_sched_class) 1463 if (prev->sched_class == &rt_sched_class)
1378 update_curr_rt(rq); 1464 update_curr_rt(rq);
1379 1465
1380 if (!rt_rq->rt_nr_running) 1466 if (!rt_rq->rt_queued)
1381 return NULL;
1382
1383 if (rt_rq_throttled(rt_rq))
1384 return NULL; 1467 return NULL;
1385 1468
1386 put_prev_task(rq, prev); 1469 put_prev_task(rq, prev);
@@ -1892,9 +1975,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1892 */ 1975 */
1893 if (p->on_rq && rq->curr != p) { 1976 if (p->on_rq && rq->curr != p) {
1894#ifdef CONFIG_SMP 1977#ifdef CONFIG_SMP
1895 if (rq->rt.overloaded && push_rt_task(rq) && 1978 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1896 /* Don't resched if we changed runqueues */ 1979 /* Don't resched if we changed runqueues */
1897 rq != task_rq(p)) 1980 push_rt_task(rq) && rq != task_rq(p))
1898 check_resched = 0; 1981 check_resched = 0;
1899#endif /* CONFIG_SMP */ 1982#endif /* CONFIG_SMP */
1900 if (check_resched && p->prio < rq->curr->prio) 1983 if (check_resched && p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 456e492a3dca..600e2291a75c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,8 @@ struct rt_rq {
 	int overloaded;
 	struct plist_head pushable_tasks;
 #endif
+	int rt_queued;
+
 	int rt_throttled;
 	u64 rt_time;
 	u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
 #endif
 };
 
-#ifdef CONFIG_RT_GROUP_SCHED
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
-	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
-}
-#else
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
-	return rt_rq->rt_throttled;
-}
-#endif
-
 /* Deadline class' related fields in a runqueue */
 struct dl_rq {
 	/* runqueue is an rbtree, ordered by deadline */
@@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
 
 extern void init_task_runnable_average(struct task_struct *p);
 
-static inline void inc_nr_running(struct rq *rq)
+static inline void add_nr_running(struct rq *rq, unsigned count)
 {
-	rq->nr_running++;
+	unsigned prev_nr = rq->nr_running;
+
+	rq->nr_running = prev_nr + count;
 
 #ifdef CONFIG_NO_HZ_FULL
-	if (rq->nr_running == 2) {
+	if (prev_nr < 2 && rq->nr_running >= 2) {
 		if (tick_nohz_full_cpu(rq->cpu)) {
 			/* Order rq->nr_running write against the IPI */
 			smp_wmb();
@@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq)
 #endif
 }
 
-static inline void dec_nr_running(struct rq *rq)
+static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
-	rq->nr_running--;
+	rq->nr_running -= count;
 }
 
 static inline void rq_last_tick_reset(struct rq *rq)
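Note: because add_nr_running() now takes a count, a single call can move nr_running from 1 straight past 2, which is why the NO_HZ_FULL check above compares the pre-update value (prev_nr < 2 && rq->nr_running >= 2) instead of testing for exactly 2. A standalone toy illustration of that boundary test (plain userspace C, not kernel code):

#include <stdio.h>

/* Returns 1 when a batched increment crosses the "more than one
 * runnable task" boundary that requires restarting the tick. */
static int crosses_boundary(unsigned prev_nr, unsigned count)
{
	return prev_nr < 2 && prev_nr + count >= 2;
}

int main(void)
{
	unsigned prev_nr = 1, count = 3;	/* queue three tasks at once */

	printf("old '== 2' test fires: %d\n", prev_nr + count == 2);			/* 0 */
	printf("new boundary test fires: %d\n", crosses_boundary(prev_nr, count));	/* 1 */
	return 0;
}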
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d6ce65dde541..bfe0edadbfbb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 static void
 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
-	inc_nr_running(rq);
+	add_nr_running(rq, 1);
 }
 
 static void
 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
-	dec_nr_running(rq);
+	sub_nr_running(rq, 1);
 }
 
 static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sys.c b/kernel/sys.c
index fba0f29401ea..66a751ebf9d9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 		else
 			p = current;
 		if (p) {
-			niceval = 20 - task_nice(p);
+			niceval = nice_to_rlimit(task_nice(p));
 			if (niceval > retval)
 				retval = niceval;
 		}
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 		else
 			pgrp = task_pgrp(current);
 		do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
-			niceval = 20 - task_nice(p);
+			niceval = nice_to_rlimit(task_nice(p));
 			if (niceval > retval)
 				retval = niceval;
 		} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
 
 		do_each_thread(g, p) {
 			if (uid_eq(task_uid(p), uid)) {
-				niceval = 20 - task_nice(p);
+				niceval = nice_to_rlimit(task_nice(p));
 				if (niceval > retval)
 					retval = niceval;
 			}
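Note: nice_to_rlimit() replaces the open-coded "20 - task_nice(p)" conversions above, and the MIN_NICE/MAX_NICE constants appear in the workqueue and khugepaged hunks below; they are presumably defined in include/linux/sched/prio.h, which is part of this series. A sketch of the presumed definitions (treat prio.h in this series as authoritative):

/* Presumed: nice spans [MIN_NICE, MAX_NICE] = [-20, 19], while
 * RLIMIT_NICE style values span [1, 40]. */
#define MAX_NICE	19
#define MIN_NICE	-20

static inline long nice_to_rlimit(long nice)
{
	return (MAX_NICE - nice + 1);	/* same as the removed "20 - nice" */
}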
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8edc87185427..a4bab46cd38e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -100,10 +100,10 @@ enum {
 
 	/*
 	 * Rescue workers are used only on emergencies and shared by
-	 * all cpus. Give -20.
+	 * all cpus. Give MIN_NICE.
 	 */
-	RESCUER_NICE_LEVEL	= -20,
-	HIGHPRI_NICE_LEVEL	= -20,
+	RESCUER_NICE_LEVEL	= MIN_NICE,
+	HIGHPRI_NICE_LEVEL	= MIN_NICE,
 
 	WQ_NAME_LEN		= 24,
 };
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b4b1feba6472..d199d2d91946 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2740,7 +2740,7 @@ static int khugepaged(void *none)
 	struct mm_slot *mm_slot;
 
 	set_freezable();
-	set_user_nice(current, 19);
+	set_user_nice(current, MAX_NICE);
 
 	while (!kthread_should_stop()) {
 		khugepaged_do_scan();
diff --git a/mm/memory.c b/mm/memory.c
index 037b812a9531..e302ae1dcce0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3920,9 +3920,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 	}
 
-	/* THP should already have been handled */
-	BUG_ON(pmd_numa(*pmd));
-
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't
 	 * run pte_offset_map on the pmd, if an huge pmd could