aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-10-21 15:55:43 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-10-21 15:55:43 -0400
commitbc4016f48161454a9a8e5eb209b0693c6cde9f62 (patch)
treef470f5d711e975b152eec90282f5dd30a1d5dba5
parent5d70f79b5ef6ea2de4f72a37b2d96e2601e40a22 (diff)
parentb7dadc38797584f6203386da1947ed5edf516646 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (29 commits) sched: Export account_system_vtime() sched: Call tick_check_idle before __irq_enter sched: Remove irq time from available CPU power sched: Do not account irq time to current task x86: Add IRQ_TIME_ACCOUNTING sched: Add IRQ_TIME_ACCOUNTING, finer accounting of irq time sched: Add a PF flag for ksoftirqd identification sched: Consolidate account_system_vtime extern declaration sched: Fix softirq time accounting sched: Drop group_capacity to 1 only if local group has extra capacity sched: Force balancing on newidle balance if local group has capacity sched: Set group_imb only a task can be pulled from the busiest cpu sched: Do not consider SCHED_IDLE tasks to be cache hot sched: Drop all load weight manipulation for RT tasks sched: Create special class for stop/migrate work sched: Unindent labels sched: Comment updates: fix default latency and granularity numbers tracing/sched: Add sched_pi_setprio tracepoint sched: Give CPU bound RT tasks preference sched: Try not to migrate higher priority RT tasks ...
-rw-r--r--Documentation/cputopology.txt23
-rw-r--r--Documentation/kernel-parameters.txt4
-rw-r--r--arch/ia64/include/asm/system.h4
-rw-r--r--arch/powerpc/include/asm/system.h4
-rw-r--r--arch/s390/Kconfig7
-rw-r--r--arch/s390/include/asm/system.h1
-rw-r--r--arch/s390/include/asm/topology.h27
-rw-r--r--arch/s390/kernel/topology.c150
-rw-r--r--arch/x86/Kconfig11
-rw-r--r--arch/x86/kernel/tsc.c8
-rw-r--r--drivers/base/topology.c16
-rw-r--r--include/linux/hardirq.h9
-rw-r--r--include/linux/sched.h23
-rw-r--r--include/linux/topology.h6
-rw-r--r--include/trace/events/sched.h29
-rw-r--r--kernel/sched.c291
-rw-r--r--kernel/sched_fair.c76
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_rt.c40
-rw-r--r--kernel/sched_stoptask.c108
-rw-r--r--kernel/softirq.c64
-rw-r--r--kernel/stop_machine.c8
-rw-r--r--net/sched/cls_cgroup.c2
23 files changed, 732 insertions, 184 deletions
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index f1c5c4bccd3e..902d3151f527 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -14,25 +14,39 @@ to /proc/cpuinfo.
14 identifier (rather than the kernel's). The actual value is 14 identifier (rather than the kernel's). The actual value is
15 architecture and platform dependent. 15 architecture and platform dependent.
16 16
173) /sys/devices/system/cpu/cpuX/topology/thread_siblings: 173) /sys/devices/system/cpu/cpuX/topology/book_id:
18
19 the book ID of cpuX. Typically it is the hardware platform's
20 identifier (rather than the kernel's). The actual value is
21 architecture and platform dependent.
22
234) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
18 24
19 internal kernel map of cpuX's hardware threads within the same 25 internal kernel map of cpuX's hardware threads within the same
20 core as cpuX 26 core as cpuX
21 27
224) /sys/devices/system/cpu/cpuX/topology/core_siblings: 285) /sys/devices/system/cpu/cpuX/topology/core_siblings:
23 29
24 internal kernel map of cpuX's hardware threads within the same 30 internal kernel map of cpuX's hardware threads within the same
25 physical_package_id. 31 physical_package_id.
26 32
336) /sys/devices/system/cpu/cpuX/topology/book_siblings:
34
35 internal kernel map of cpuX's hardware threads within the same
36 book_id.
37
27To implement it in an architecture-neutral way, a new source file, 38To implement it in an architecture-neutral way, a new source file,
28drivers/base/topology.c, is to export the 4 attributes. 39drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
40related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
29 41
30For an architecture to support this feature, it must define some of 42For an architecture to support this feature, it must define some of
31these macros in include/asm-XXX/topology.h: 43these macros in include/asm-XXX/topology.h:
32#define topology_physical_package_id(cpu) 44#define topology_physical_package_id(cpu)
33#define topology_core_id(cpu) 45#define topology_core_id(cpu)
46#define topology_book_id(cpu)
34#define topology_thread_cpumask(cpu) 47#define topology_thread_cpumask(cpu)
35#define topology_core_cpumask(cpu) 48#define topology_core_cpumask(cpu)
49#define topology_book_cpumask(cpu)
36 50
37The type of **_id is int. 51The type of **_id is int.
38The type of siblings is (const) struct cpumask *. 52The type of siblings is (const) struct cpumask *.
@@ -45,6 +59,9 @@ not defined by include/asm-XXX/topology.h:
453) thread_siblings: just the given CPU 593) thread_siblings: just the given CPU
464) core_siblings: just the given CPU 604) core_siblings: just the given CPU
47 61
62For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
63default definitions for topology_book_id() and topology_book_cpumask().
64
48Additionally, CPU topology information is provided under 65Additionally, CPU topology information is provided under
49/sys/devices/system/cpu and includes these files. The internal 66/sys/devices/system/cpu and includes these files. The internal
50source for the output is in brackets ("[]"). 67source for the output is in brackets ("[]").
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8dd7248508a9..ed05a4a0d242 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2435,6 +2435,10 @@ and is between 256 and 4096 characters. It is defined in the file
2435 disables clocksource verification at runtime. 2435 disables clocksource verification at runtime.
2436 Used to enable high-resolution timer mode on older 2436 Used to enable high-resolution timer mode on older
2437 hardware, and in virtualized environment. 2437 hardware, and in virtualized environment.
2438 [x86] noirqtime: Do not use TSC to do irq accounting.
2439 Used to disable IRQ_TIME_ACCOUNTING at run time on any
2440 platforms where RDTSC is slow and this accounting
2441 can add overhead.
2438 2442
2439 turbografx.map[2|3]= [HW,JOY] 2443 turbografx.map[2|3]= [HW,JOY]
2440 TurboGraFX parallel port interface 2444 TurboGraFX parallel port interface
diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h
index 9f342a574ce8..dd028f2b13b3 100644
--- a/arch/ia64/include/asm/system.h
+++ b/arch/ia64/include/asm/system.h
@@ -272,10 +272,6 @@ void cpu_idle_wait(void);
272 272
273void default_idle(void); 273void default_idle(void);
274 274
275#ifdef CONFIG_VIRT_CPU_ACCOUNTING
276extern void account_system_vtime(struct task_struct *);
277#endif
278
279#endif /* __KERNEL__ */ 275#endif /* __KERNEL__ */
280 276
281#endif /* __ASSEMBLY__ */ 277#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
index 6c294acac848..9c3d160670b4 100644
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -542,10 +542,6 @@ extern void reloc_got2(unsigned long);
542 542
543#define PTRRELOC(x) ((typeof(x)) add_reloc_offset((unsigned long)(x))) 543#define PTRRELOC(x) ((typeof(x)) add_reloc_offset((unsigned long)(x)))
544 544
545#ifdef CONFIG_VIRT_CPU_ACCOUNTING
546extern void account_system_vtime(struct task_struct *);
547#endif
548
549extern struct dentry *powerpc_debugfs_root; 545extern struct dentry *powerpc_debugfs_root;
550 546
551#endif /* __KERNEL__ */ 547#endif /* __KERNEL__ */
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 958f0dadeadf..75976a141947 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -199,6 +199,13 @@ config HOTPLUG_CPU
199 can be controlled through /sys/devices/system/cpu/cpu#. 199 can be controlled through /sys/devices/system/cpu/cpu#.
200 Say N if you want to disable CPU hotplug. 200 Say N if you want to disable CPU hotplug.
201 201
202config SCHED_BOOK
203 bool "Book scheduler support"
204 depends on SMP
205 help
206 Book scheduler support improves the CPU scheduler's decision making
207 when dealing with machines that have several books.
208
202config MATHEMU 209config MATHEMU
203 bool "IEEE FPU emulation" 210 bool "IEEE FPU emulation"
204 depends on MARCH_G5 211 depends on MARCH_G5
diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h
index cef66210c846..38ddd8a9a9e8 100644
--- a/arch/s390/include/asm/system.h
+++ b/arch/s390/include/asm/system.h
@@ -97,7 +97,6 @@ static inline void restore_access_regs(unsigned int *acrs)
97 97
98extern void account_vtime(struct task_struct *, struct task_struct *); 98extern void account_vtime(struct task_struct *, struct task_struct *);
99extern void account_tick_vtime(struct task_struct *); 99extern void account_tick_vtime(struct task_struct *);
100extern void account_system_vtime(struct task_struct *);
101 100
102#ifdef CONFIG_PFAULT 101#ifdef CONFIG_PFAULT
103extern void pfault_irq_init(void); 102extern void pfault_irq_init(void);
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index 831bd033ea77..051107a2c5e2 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -3,15 +3,32 @@
3 3
4#include <linux/cpumask.h> 4#include <linux/cpumask.h>
5 5
6#define mc_capable() (1)
7
8const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
9
10extern unsigned char cpu_core_id[NR_CPUS]; 6extern unsigned char cpu_core_id[NR_CPUS];
11extern cpumask_t cpu_core_map[NR_CPUS]; 7extern cpumask_t cpu_core_map[NR_CPUS];
12 8
9static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
10{
11 return &cpu_core_map[cpu];
12}
13
13#define topology_core_id(cpu) (cpu_core_id[cpu]) 14#define topology_core_id(cpu) (cpu_core_id[cpu])
14#define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) 15#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
16#define mc_capable() (1)
17
18#ifdef CONFIG_SCHED_BOOK
19
20extern unsigned char cpu_book_id[NR_CPUS];
21extern cpumask_t cpu_book_map[NR_CPUS];
22
23static inline const struct cpumask *cpu_book_mask(unsigned int cpu)
24{
25 return &cpu_book_map[cpu];
26}
27
28#define topology_book_id(cpu) (cpu_book_id[cpu])
29#define topology_book_cpumask(cpu) (&cpu_book_map[cpu])
30
31#endif /* CONFIG_SCHED_BOOK */
15 32
16int topology_set_cpu_management(int fc); 33int topology_set_cpu_management(int fc);
17void topology_schedule_update(void); 34void topology_schedule_update(void);
@@ -30,6 +47,8 @@ static inline void s390_init_cpu_topology(void)
30}; 47};
31#endif 48#endif
32 49
50#define SD_BOOK_INIT SD_CPU_INIT
51
33#include <asm-generic/topology.h> 52#include <asm-generic/topology.h>
34 53
35#endif /* _ASM_S390_TOPOLOGY_H */ 54#endif /* _ASM_S390_TOPOLOGY_H */
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index bcef00766a64..13559c993847 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -57,8 +57,8 @@ struct tl_info {
57 union tl_entry tle[0]; 57 union tl_entry tle[0];
58}; 58};
59 59
60struct core_info { 60struct mask_info {
61 struct core_info *next; 61 struct mask_info *next;
62 unsigned char id; 62 unsigned char id;
63 cpumask_t mask; 63 cpumask_t mask;
64}; 64};
@@ -66,7 +66,6 @@ struct core_info {
66static int topology_enabled; 66static int topology_enabled;
67static void topology_work_fn(struct work_struct *work); 67static void topology_work_fn(struct work_struct *work);
68static struct tl_info *tl_info; 68static struct tl_info *tl_info;
69static struct core_info core_info;
70static int machine_has_topology; 69static int machine_has_topology;
71static struct timer_list topology_timer; 70static struct timer_list topology_timer;
72static void set_topology_timer(void); 71static void set_topology_timer(void);
@@ -74,38 +73,37 @@ static DECLARE_WORK(topology_work, topology_work_fn);
74/* topology_lock protects the core linked list */ 73/* topology_lock protects the core linked list */
75static DEFINE_SPINLOCK(topology_lock); 74static DEFINE_SPINLOCK(topology_lock);
76 75
76static struct mask_info core_info;
77cpumask_t cpu_core_map[NR_CPUS]; 77cpumask_t cpu_core_map[NR_CPUS];
78unsigned char cpu_core_id[NR_CPUS]; 78unsigned char cpu_core_id[NR_CPUS];
79 79
80static cpumask_t cpu_coregroup_map(unsigned int cpu) 80#ifdef CONFIG_SCHED_BOOK
81static struct mask_info book_info;
82cpumask_t cpu_book_map[NR_CPUS];
83unsigned char cpu_book_id[NR_CPUS];
84#endif
85
86static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
81{ 87{
82 struct core_info *core = &core_info;
83 unsigned long flags;
84 cpumask_t mask; 88 cpumask_t mask;
85 89
86 cpus_clear(mask); 90 cpus_clear(mask);
87 if (!topology_enabled || !machine_has_topology) 91 if (!topology_enabled || !machine_has_topology)
88 return cpu_possible_map; 92 return cpu_possible_map;
89 spin_lock_irqsave(&topology_lock, flags); 93 while (info) {
90 while (core) { 94 if (cpu_isset(cpu, info->mask)) {
91 if (cpu_isset(cpu, core->mask)) { 95 mask = info->mask;
92 mask = core->mask;
93 break; 96 break;
94 } 97 }
95 core = core->next; 98 info = info->next;
96 } 99 }
97 spin_unlock_irqrestore(&topology_lock, flags);
98 if (cpus_empty(mask)) 100 if (cpus_empty(mask))
99 mask = cpumask_of_cpu(cpu); 101 mask = cpumask_of_cpu(cpu);
100 return mask; 102 return mask;
101} 103}
102 104
103const struct cpumask *cpu_coregroup_mask(unsigned int cpu) 105static void add_cpus_to_mask(struct tl_cpu *tl_cpu, struct mask_info *book,
104{ 106 struct mask_info *core)
105 return &cpu_core_map[cpu];
106}
107
108static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
109{ 107{
110 unsigned int cpu; 108 unsigned int cpu;
111 109
@@ -117,23 +115,35 @@ static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
117 115
118 rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin; 116 rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
119 for_each_present_cpu(lcpu) { 117 for_each_present_cpu(lcpu) {
120 if (cpu_logical_map(lcpu) == rcpu) { 118 if (cpu_logical_map(lcpu) != rcpu)
121 cpu_set(lcpu, core->mask); 119 continue;
122 cpu_core_id[lcpu] = core->id; 120#ifdef CONFIG_SCHED_BOOK
123 smp_cpu_polarization[lcpu] = tl_cpu->pp; 121 cpu_set(lcpu, book->mask);
124 } 122 cpu_book_id[lcpu] = book->id;
123#endif
124 cpu_set(lcpu, core->mask);
125 cpu_core_id[lcpu] = core->id;
126 smp_cpu_polarization[lcpu] = tl_cpu->pp;
125 } 127 }
126 } 128 }
127} 129}
128 130
129static void clear_cores(void) 131static void clear_masks(void)
130{ 132{
131 struct core_info *core = &core_info; 133 struct mask_info *info;
132 134
133 while (core) { 135 info = &core_info;
134 cpus_clear(core->mask); 136 while (info) {
135 core = core->next; 137 cpus_clear(info->mask);
138 info = info->next;
139 }
140#ifdef CONFIG_SCHED_BOOK
141 info = &book_info;
142 while (info) {
143 cpus_clear(info->mask);
144 info = info->next;
136 } 145 }
146#endif
137} 147}
138 148
139static union tl_entry *next_tle(union tl_entry *tle) 149static union tl_entry *next_tle(union tl_entry *tle)
@@ -146,29 +156,36 @@ static union tl_entry *next_tle(union tl_entry *tle)
146 156
147static void tl_to_cores(struct tl_info *info) 157static void tl_to_cores(struct tl_info *info)
148{ 158{
159#ifdef CONFIG_SCHED_BOOK
160 struct mask_info *book = &book_info;
161#else
162 struct mask_info *book = NULL;
163#endif
164 struct mask_info *core = &core_info;
149 union tl_entry *tle, *end; 165 union tl_entry *tle, *end;
150 struct core_info *core = &core_info; 166
151 167
152 spin_lock_irq(&topology_lock); 168 spin_lock_irq(&topology_lock);
153 clear_cores(); 169 clear_masks();
154 tle = info->tle; 170 tle = info->tle;
155 end = (union tl_entry *)((unsigned long)info + info->length); 171 end = (union tl_entry *)((unsigned long)info + info->length);
156 while (tle < end) { 172 while (tle < end) {
157 switch (tle->nl) { 173 switch (tle->nl) {
158 case 5: 174#ifdef CONFIG_SCHED_BOOK
159 case 4:
160 case 3:
161 case 2: 175 case 2:
176 book = book->next;
177 book->id = tle->container.id;
162 break; 178 break;
179#endif
163 case 1: 180 case 1:
164 core = core->next; 181 core = core->next;
165 core->id = tle->container.id; 182 core->id = tle->container.id;
166 break; 183 break;
167 case 0: 184 case 0:
168 add_cpus_to_core(&tle->cpu, core); 185 add_cpus_to_mask(&tle->cpu, book, core);
169 break; 186 break;
170 default: 187 default:
171 clear_cores(); 188 clear_masks();
172 machine_has_topology = 0; 189 machine_has_topology = 0;
173 goto out; 190 goto out;
174 } 191 }
@@ -221,10 +238,29 @@ int topology_set_cpu_management(int fc)
221 238
222static void update_cpu_core_map(void) 239static void update_cpu_core_map(void)
223{ 240{
241 unsigned long flags;
224 int cpu; 242 int cpu;
225 243
226 for_each_possible_cpu(cpu) 244 spin_lock_irqsave(&topology_lock, flags);
227 cpu_core_map[cpu] = cpu_coregroup_map(cpu); 245 for_each_possible_cpu(cpu) {
246 cpu_core_map[cpu] = cpu_group_map(&core_info, cpu);
247#ifdef CONFIG_SCHED_BOOK
248 cpu_book_map[cpu] = cpu_group_map(&book_info, cpu);
249#endif
250 }
251 spin_unlock_irqrestore(&topology_lock, flags);
252}
253
254static void store_topology(struct tl_info *info)
255{
256#ifdef CONFIG_SCHED_BOOK
257 int rc;
258
259 rc = stsi(info, 15, 1, 3);
260 if (rc != -ENOSYS)
261 return;
262#endif
263 stsi(info, 15, 1, 2);
228} 264}
229 265
230int arch_update_cpu_topology(void) 266int arch_update_cpu_topology(void)
@@ -238,7 +274,7 @@ int arch_update_cpu_topology(void)
238 topology_update_polarization_simple(); 274 topology_update_polarization_simple();
239 return 0; 275 return 0;
240 } 276 }
241 stsi(info, 15, 1, 2); 277 store_topology(info);
242 tl_to_cores(info); 278 tl_to_cores(info);
243 update_cpu_core_map(); 279 update_cpu_core_map();
244 for_each_online_cpu(cpu) { 280 for_each_online_cpu(cpu) {
@@ -299,12 +335,24 @@ out:
299} 335}
300__initcall(init_topology_update); 336__initcall(init_topology_update);
301 337
338static void alloc_masks(struct tl_info *info, struct mask_info *mask, int offset)
339{
340 int i, nr_masks;
341
342 nr_masks = info->mag[NR_MAG - offset];
343 for (i = 0; i < info->mnest - offset; i++)
344 nr_masks *= info->mag[NR_MAG - offset - 1 - i];
345 nr_masks = max(nr_masks, 1);
346 for (i = 0; i < nr_masks; i++) {
347 mask->next = alloc_bootmem(sizeof(struct mask_info));
348 mask = mask->next;
349 }
350}
351
302void __init s390_init_cpu_topology(void) 352void __init s390_init_cpu_topology(void)
303{ 353{
304 unsigned long long facility_bits; 354 unsigned long long facility_bits;
305 struct tl_info *info; 355 struct tl_info *info;
306 struct core_info *core;
307 int nr_cores;
308 int i; 356 int i;
309 357
310 if (stfle(&facility_bits, 1) <= 0) 358 if (stfle(&facility_bits, 1) <= 0)
@@ -315,25 +363,13 @@ void __init s390_init_cpu_topology(void)
315 363
316 tl_info = alloc_bootmem_pages(PAGE_SIZE); 364 tl_info = alloc_bootmem_pages(PAGE_SIZE);
317 info = tl_info; 365 info = tl_info;
318 stsi(info, 15, 1, 2); 366 store_topology(info);
319
320 nr_cores = info->mag[NR_MAG - 2];
321 for (i = 0; i < info->mnest - 2; i++)
322 nr_cores *= info->mag[NR_MAG - 3 - i];
323
324 pr_info("The CPU configuration topology of the machine is:"); 367 pr_info("The CPU configuration topology of the machine is:");
325 for (i = 0; i < NR_MAG; i++) 368 for (i = 0; i < NR_MAG; i++)
326 printk(" %d", info->mag[i]); 369 printk(" %d", info->mag[i]);
327 printk(" / %d\n", info->mnest); 370 printk(" / %d\n", info->mnest);
328 371 alloc_masks(info, &core_info, 2);
329 core = &core_info; 372#ifdef CONFIG_SCHED_BOOK
330 for (i = 0; i < nr_cores; i++) { 373 alloc_masks(info, &book_info, 3);
331 core->next = alloc_bootmem(sizeof(struct core_info)); 374#endif
332 core = core->next;
333 if (!core)
334 goto error;
335 }
336 return;
337error:
338 machine_has_topology = 0;
339} 375}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fd227d6b8d9c..89b88e3a56e9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -799,6 +799,17 @@ config SCHED_MC
799 making when dealing with multi-core CPU chips at a cost of slightly 799 making when dealing with multi-core CPU chips at a cost of slightly
800 increased overhead in some places. If unsure say N here. 800 increased overhead in some places. If unsure say N here.
801 801
802config IRQ_TIME_ACCOUNTING
803 bool "Fine granularity task level IRQ time accounting"
804 default n
805 ---help---
806 Select this option to enable fine granularity task irq time
807 accounting. This is done by reading a timestamp on each
808 transitions between softirq and hardirq state, so there can be a
809 small performance impact.
810
811 If in doubt, say N here.
812
802source "kernel/Kconfig.preempt" 813source "kernel/Kconfig.preempt"
803 814
804config X86_UP_APIC 815config X86_UP_APIC
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 26a863a9c2a8..a1c2cd768538 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
104 104
105__setup("notsc", notsc_setup); 105__setup("notsc", notsc_setup);
106 106
107static int no_sched_irq_time;
108
107static int __init tsc_setup(char *str) 109static int __init tsc_setup(char *str)
108{ 110{
109 if (!strcmp(str, "reliable")) 111 if (!strcmp(str, "reliable"))
110 tsc_clocksource_reliable = 1; 112 tsc_clocksource_reliable = 1;
113 if (!strncmp(str, "noirqtime", 9))
114 no_sched_irq_time = 1;
111 return 1; 115 return 1;
112} 116}
113 117
@@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason)
801 if (!tsc_unstable) { 805 if (!tsc_unstable) {
802 tsc_unstable = 1; 806 tsc_unstable = 1;
803 sched_clock_stable = 0; 807 sched_clock_stable = 0;
808 disable_sched_clock_irqtime();
804 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 809 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
805 /* Change only the rating, when not registered */ 810 /* Change only the rating, when not registered */
806 if (clocksource_tsc.mult) 811 if (clocksource_tsc.mult)
@@ -987,6 +992,9 @@ void __init tsc_init(void)
987 /* now allow native_sched_clock() to use rdtsc */ 992 /* now allow native_sched_clock() to use rdtsc */
988 tsc_disabled = 0; 993 tsc_disabled = 0;
989 994
995 if (!no_sched_irq_time)
996 enable_sched_clock_irqtime();
997
990 lpj = ((u64)tsc_khz * 1000); 998 lpj = ((u64)tsc_khz * 1000);
991 do_div(lpj, HZ); 999 do_div(lpj, HZ);
992 lpj_fine = lpj; 1000 lpj_fine = lpj;
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 9fc630ce1ddb..f6f37a05a0c3 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -45,7 +45,8 @@ static ssize_t show_##name(struct sys_device *dev, \
45 return sprintf(buf, "%d\n", topology_##name(cpu)); \ 45 return sprintf(buf, "%d\n", topology_##name(cpu)); \
46} 46}
47 47
48#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) 48#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) || \
49 defined(topology_book_cpumask)
49static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf) 50static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
50{ 51{
51 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; 52 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
@@ -114,6 +115,14 @@ define_siblings_show_func(core_cpumask);
114define_one_ro_named(core_siblings, show_core_cpumask); 115define_one_ro_named(core_siblings, show_core_cpumask);
115define_one_ro_named(core_siblings_list, show_core_cpumask_list); 116define_one_ro_named(core_siblings_list, show_core_cpumask_list);
116 117
118#ifdef CONFIG_SCHED_BOOK
119define_id_show_func(book_id);
120define_one_ro(book_id);
121define_siblings_show_func(book_cpumask);
122define_one_ro_named(book_siblings, show_book_cpumask);
123define_one_ro_named(book_siblings_list, show_book_cpumask_list);
124#endif
125
117static struct attribute *default_attrs[] = { 126static struct attribute *default_attrs[] = {
118 &attr_physical_package_id.attr, 127 &attr_physical_package_id.attr,
119 &attr_core_id.attr, 128 &attr_core_id.attr,
@@ -121,6 +130,11 @@ static struct attribute *default_attrs[] = {
121 &attr_thread_siblings_list.attr, 130 &attr_thread_siblings_list.attr,
122 &attr_core_siblings.attr, 131 &attr_core_siblings.attr,
123 &attr_core_siblings_list.attr, 132 &attr_core_siblings_list.attr,
133#ifdef CONFIG_SCHED_BOOK
134 &attr_book_id.attr,
135 &attr_book_siblings.attr,
136 &attr_book_siblings_list.attr,
137#endif
124 NULL 138 NULL
125}; 139};
126 140
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 1f4517d55b19..96c323ac44df 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -64,6 +64,8 @@
64#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) 64#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
65#define NMI_OFFSET (1UL << NMI_SHIFT) 65#define NMI_OFFSET (1UL << NMI_SHIFT)
66 66
67#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
68
67#ifndef PREEMPT_ACTIVE 69#ifndef PREEMPT_ACTIVE
68#define PREEMPT_ACTIVE_BITS 1 70#define PREEMPT_ACTIVE_BITS 1
69#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) 71#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
@@ -82,10 +84,13 @@
82/* 84/*
83 * Are we doing bottom half or hardware interrupt processing? 85 * Are we doing bottom half or hardware interrupt processing?
84 * Are we in a softirq context? Interrupt context? 86 * Are we in a softirq context? Interrupt context?
87 * in_softirq - Are we currently processing softirq or have bh disabled?
88 * in_serving_softirq - Are we currently processing softirq?
85 */ 89 */
86#define in_irq() (hardirq_count()) 90#define in_irq() (hardirq_count())
87#define in_softirq() (softirq_count()) 91#define in_softirq() (softirq_count())
88#define in_interrupt() (irq_count()) 92#define in_interrupt() (irq_count())
93#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
89 94
90/* 95/*
91 * Are we in NMI context? 96 * Are we in NMI context?
@@ -132,10 +137,12 @@ extern void synchronize_irq(unsigned int irq);
132 137
133struct task_struct; 138struct task_struct;
134 139
135#ifndef CONFIG_VIRT_CPU_ACCOUNTING 140#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
136static inline void account_system_vtime(struct task_struct *tsk) 141static inline void account_system_vtime(struct task_struct *tsk)
137{ 142{
138} 143}
144#else
145extern void account_system_vtime(struct task_struct *tsk);
139#endif 146#endif
140 147
141#if defined(CONFIG_NO_HZ) 148#if defined(CONFIG_NO_HZ)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 61b4ecf1da50..0383601a927c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -875,6 +875,7 @@ enum sched_domain_level {
875 SD_LV_NONE = 0, 875 SD_LV_NONE = 0,
876 SD_LV_SIBLING, 876 SD_LV_SIBLING,
877 SD_LV_MC, 877 SD_LV_MC,
878 SD_LV_BOOK,
878 SD_LV_CPU, 879 SD_LV_CPU,
879 SD_LV_NODE, 880 SD_LV_NODE,
880 SD_LV_ALLNODES, 881 SD_LV_ALLNODES,
@@ -1690,8 +1691,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
1690/* 1691/*
1691 * Per process flags 1692 * Per process flags
1692 */ 1693 */
1693#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ 1694#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */
1694 /* Not implemented yet, only for 486*/
1695#define PF_STARTING 0x00000002 /* being created */ 1695#define PF_STARTING 0x00000002 /* being created */
1696#define PF_EXITING 0x00000004 /* getting shut down */ 1696#define PF_EXITING 0x00000004 /* getting shut down */
1697#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1697#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
@@ -1837,6 +1837,19 @@ extern void sched_clock_idle_sleep_event(void);
1837extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1837extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1838#endif 1838#endif
1839 1839
1840#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1841/*
1842 * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
1843 * The reason for this explicit opt-in is not to have perf penalty with
1844 * slow sched_clocks.
1845 */
1846extern void enable_sched_clock_irqtime(void);
1847extern void disable_sched_clock_irqtime(void);
1848#else
1849static inline void enable_sched_clock_irqtime(void) {}
1850static inline void disable_sched_clock_irqtime(void) {}
1851#endif
1852
1840extern unsigned long long 1853extern unsigned long long
1841task_sched_runtime(struct task_struct *task); 1854task_sched_runtime(struct task_struct *task);
1842extern unsigned long long thread_group_sched_runtime(struct task_struct *task); 1855extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
@@ -2378,9 +2391,9 @@ extern int __cond_resched_lock(spinlock_t *lock);
2378 2391
2379extern int __cond_resched_softirq(void); 2392extern int __cond_resched_softirq(void);
2380 2393
2381#define cond_resched_softirq() ({ \ 2394#define cond_resched_softirq() ({ \
2382 __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ 2395 __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
2383 __cond_resched_softirq(); \ 2396 __cond_resched_softirq(); \
2384}) 2397})
2385 2398
2386/* 2399/*
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 64e084ff5e5c..b91a40e847d2 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -201,6 +201,12 @@ int arch_update_cpu_topology(void);
201 .balance_interval = 64, \ 201 .balance_interval = 64, \
202} 202}
203 203
204#ifdef CONFIG_SCHED_BOOK
205#ifndef SD_BOOK_INIT
206#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
207#endif
208#endif /* CONFIG_SCHED_BOOK */
209
204#ifdef CONFIG_NUMA 210#ifdef CONFIG_NUMA
205#ifndef SD_NODE_INIT 211#ifndef SD_NODE_INIT
206#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! 212#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9208c92aeab5..f6334782a593 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime,
362 (unsigned long long)__entry->vruntime) 362 (unsigned long long)__entry->vruntime)
363); 363);
364 364
365/*
366 * Tracepoint for showing priority inheritance modifying a task's
367 * priority.
368 */
369TRACE_EVENT(sched_pi_setprio,
370
371 TP_PROTO(struct task_struct *tsk, int newprio),
372
373 TP_ARGS(tsk, newprio),
374
375 TP_STRUCT__entry(
376 __array( char, comm, TASK_COMM_LEN )
377 __field( pid_t, pid )
378 __field( int, oldprio )
379 __field( int, newprio )
380 ),
381
382 TP_fast_assign(
383 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
384 __entry->pid = tsk->pid;
385 __entry->oldprio = tsk->prio;
386 __entry->newprio = newprio;
387 ),
388
389 TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
390 __entry->comm, __entry->pid,
391 __entry->oldprio, __entry->newprio)
392);
393
365#endif /* _TRACE_SCHED_H */ 394#endif /* _TRACE_SCHED_H */
366 395
367/* This part must be outside protection */ 396/* This part must be outside protection */
diff --git a/kernel/sched.c b/kernel/sched.c
index 5a5cc33e4999..d42992bccdfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
426 */ 426 */
427 cpumask_var_t rto_mask; 427 cpumask_var_t rto_mask;
428 atomic_t rto_count; 428 atomic_t rto_count;
429#ifdef CONFIG_SMP
430 struct cpupri cpupri; 429 struct cpupri cpupri;
431#endif
432}; 430};
433 431
434/* 432/*
@@ -437,7 +435,7 @@ struct root_domain {
437 */ 435 */
438static struct root_domain def_root_domain; 436static struct root_domain def_root_domain;
439 437
440#endif 438#endif /* CONFIG_SMP */
441 439
442/* 440/*
443 * This is the main, per-CPU runqueue data structure. 441 * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
488 */ 486 */
489 unsigned long nr_uninterruptible; 487 unsigned long nr_uninterruptible;
490 488
491 struct task_struct *curr, *idle; 489 struct task_struct *curr, *idle, *stop;
492 unsigned long next_balance; 490 unsigned long next_balance;
493 struct mm_struct *prev_mm; 491 struct mm_struct *prev_mm;
494 492
495 u64 clock; 493 u64 clock;
494 u64 clock_task;
496 495
497 atomic_t nr_iowait; 496 atomic_t nr_iowait;
498 497
@@ -520,6 +519,10 @@ struct rq {
520 u64 avg_idle; 519 u64 avg_idle;
521#endif 520#endif
522 521
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time;
524#endif
525
523 /* calc_load related fields */ 526 /* calc_load related fields */
524 unsigned long calc_load_update; 527 unsigned long calc_load_update;
525 long calc_load_active; 528 long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
643 646
644#endif /* CONFIG_CGROUP_SCHED */ 647#endif /* CONFIG_CGROUP_SCHED */
645 648
649static u64 irq_time_cpu(int cpu);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651
646inline void update_rq_clock(struct rq *rq) 652inline void update_rq_clock(struct rq *rq)
647{ 653{
648 if (!rq->skip_clock_update) 654 if (!rq->skip_clock_update) {
649 rq->clock = sched_clock_cpu(cpu_of(rq)); 655 int cpu = cpu_of(rq);
656 u64 irq_time;
657
658 rq->clock = sched_clock_cpu(cpu);
659 irq_time = irq_time_cpu(cpu);
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662
663 sched_irq_time_avg_update(rq, irq_time);
664 }
650} 665}
651 666
652/* 667/*
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
723 size_t cnt, loff_t *ppos) 738 size_t cnt, loff_t *ppos)
724{ 739{
725 char buf[64]; 740 char buf[64];
726 char *cmp = buf; 741 char *cmp;
727 int neg = 0; 742 int neg = 0;
728 int i; 743 int i;
729 744
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
734 return -EFAULT; 749 return -EFAULT;
735 750
736 buf[cnt] = 0; 751 buf[cnt] = 0;
752 cmp = strstrip(buf);
737 753
738 if (strncmp(buf, "NO_", 3) == 0) { 754 if (strncmp(buf, "NO_", 3) == 0) {
739 neg = 1; 755 neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
741 } 757 }
742 758
743 for (i = 0; sched_feat_names[i]; i++) { 759 for (i = 0; sched_feat_names[i]; i++) {
744 int len = strlen(sched_feat_names[i]); 760 if (strcmp(cmp, sched_feat_names[i]) == 0) {
745
746 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
747 if (neg) 761 if (neg)
748 sysctl_sched_features &= ~(1UL << i); 762 sysctl_sched_features &= ~(1UL << i);
749 else 763 else
@@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1840 1854
1841static const struct sched_class rt_sched_class; 1855static const struct sched_class rt_sched_class;
1842 1856
1843#define sched_class_highest (&rt_sched_class) 1857#define sched_class_highest (&stop_sched_class)
1844#define for_each_class(class) \ 1858#define for_each_class(class) \
1845 for (class = sched_class_highest; class; class = class->next) 1859 for (class = sched_class_highest; class; class = class->next)
1846 1860
@@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
1858 1872
1859static void set_load_weight(struct task_struct *p) 1873static void set_load_weight(struct task_struct *p)
1860{ 1874{
1861 if (task_has_rt_policy(p)) {
1862 p->se.load.weight = 0;
1863 p->se.load.inv_weight = WMULT_CONST;
1864 return;
1865 }
1866
1867 /* 1875 /*
1868 * SCHED_IDLE tasks get minimal weight: 1876 * SCHED_IDLE tasks get minimal weight:
1869 */ 1877 */
@@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1917 dec_nr_running(rq); 1925 dec_nr_running(rq);
1918} 1926}
1919 1927
1928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1929
1930/*
1931 * There are no locks covering percpu hardirq/softirq time.
1932 * They are only modified in account_system_vtime, on corresponding CPU
1933 * with interrupts disabled. So, writes are safe.
1934 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in another CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of
1938 * accounting a slice of irq time to wrong task when irq is in progress
1939 * while we read rq->clock. That is a worthy compromise in place of having
1940 * locks on each irq in account_system_vtime.
1941 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time);
1944
1945static DEFINE_PER_CPU(u64, irq_start_time);
1946static int sched_clock_irqtime;
1947
1948void enable_sched_clock_irqtime(void)
1949{
1950 sched_clock_irqtime = 1;
1951}
1952
1953void disable_sched_clock_irqtime(void)
1954{
1955 sched_clock_irqtime = 0;
1956}
1957
1958static u64 irq_time_cpu(int cpu)
1959{
1960 if (!sched_clock_irqtime)
1961 return 0;
1962
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964}
1965
1966void account_system_vtime(struct task_struct *curr)
1967{
1968 unsigned long flags;
1969 int cpu;
1970 u64 now, delta;
1971
1972 if (!sched_clock_irqtime)
1973 return;
1974
1975 local_irq_save(flags);
1976
1977 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu);
1979 delta = now - per_cpu(irq_start_time, cpu);
1980 per_cpu(irq_start_time, cpu) = now;
1981 /*
1982 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to ksoftirqd thread
1984 * in that case, so as not to confuse the scheduler with a special task
1985 * that does not consume any time, but still wants to run.
1986 */
1987 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta;
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta;
1991
1992 local_irq_restore(flags);
1993}
1994EXPORT_SYMBOL_GPL(account_system_vtime);
1995
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
1997{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time;
2000 rq->prev_irq_time = curr_irq_time;
2001 sched_rt_avg_update(rq, delta_irq);
2002 }
2003}
2004
2005#else
2006
2007static u64 irq_time_cpu(int cpu)
2008{
2009 return 0;
2010}
2011
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
2013
2014#endif
2015
1920#include "sched_idletask.c" 2016#include "sched_idletask.c"
1921#include "sched_fair.c" 2017#include "sched_fair.c"
1922#include "sched_rt.c" 2018#include "sched_rt.c"
2019#include "sched_stoptask.c"
1923#ifdef CONFIG_SCHED_DEBUG 2020#ifdef CONFIG_SCHED_DEBUG
1924# include "sched_debug.c" 2021# include "sched_debug.c"
1925#endif 2022#endif
1926 2023
2024void sched_set_stop_task(int cpu, struct task_struct *stop)
2025{
2026 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2027 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2028
2029 if (stop) {
2030 /*
2031 * Make it appear like a SCHED_FIFO task, it's something
2032 * userspace knows about and won't get confused about.
2033 *
2034 * Also, it will make PI more or less work without too
2035 * much confusion -- but then, stop work should not
2036 * rely on PI working anyway.
2037 */
2038 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2039
2040 stop->sched_class = &stop_sched_class;
2041 }
2042
2043 cpu_rq(cpu)->stop = stop;
2044
2045 if (old_stop) {
2046 /*
2047 * Reset it back to a normal scheduling class so that
2048 * it can die in pieces.
2049 */
2050 old_stop->sched_class = &rt_sched_class;
2051 }
2052}
2053
1927/* 2054/*
1928 * __normal_prio - return the priority that is based on the static prio 2055 * __normal_prio - return the priority that is based on the static prio
1929 */ 2056 */
@@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2003 if (p->sched_class != &fair_sched_class) 2130 if (p->sched_class != &fair_sched_class)
2004 return 0; 2131 return 0;
2005 2132
2133 if (unlikely(p->policy == SCHED_IDLE))
2134 return 0;
2135
2006 /* 2136 /*
2007 * Buddy candidates are cache hot: 2137 * Buddy candidates are cache hot:
2008 */ 2138 */
@@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2852 */ 2982 */
2853 arch_start_context_switch(prev); 2983 arch_start_context_switch(prev);
2854 2984
2855 if (likely(!mm)) { 2985 if (!mm) {
2856 next->active_mm = oldmm; 2986 next->active_mm = oldmm;
2857 atomic_inc(&oldmm->mm_count); 2987 atomic_inc(&oldmm->mm_count);
2858 enter_lazy_tlb(oldmm, next); 2988 enter_lazy_tlb(oldmm, next);
2859 } else 2989 } else
2860 switch_mm(oldmm, mm, next); 2990 switch_mm(oldmm, mm, next);
2861 2991
2862 if (likely(!prev->mm)) { 2992 if (!prev->mm) {
2863 prev->active_mm = NULL; 2993 prev->active_mm = NULL;
2864 rq->prev_mm = oldmm; 2994 rq->prev_mm = oldmm;
2865 } 2995 }
@@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3248 3378
3249 if (task_current(rq, p)) { 3379 if (task_current(rq, p)) {
3250 update_rq_clock(rq); 3380 update_rq_clock(rq);
3251 ns = rq->clock - p->se.exec_start; 3381 ns = rq->clock_task - p->se.exec_start;
3252 if ((s64)ns < 0) 3382 if ((s64)ns < 0)
3253 ns = 0; 3383 ns = 0;
3254 } 3384 }
@@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3397 tmp = cputime_to_cputime64(cputime); 3527 tmp = cputime_to_cputime64(cputime);
3398 if (hardirq_count() - hardirq_offset) 3528 if (hardirq_count() - hardirq_offset)
3399 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3529 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3400 else if (softirq_count()) 3530 else if (in_serving_softirq())
3401 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3531 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3402 else 3532 else
3403 cpustat->system = cputime64_add(cpustat->system, tmp); 3533 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq)
3723 return p; 3853 return p;
3724 } 3854 }
3725 3855
3726 class = sched_class_highest; 3856 for_each_class(class) {
3727 for ( ; ; ) {
3728 p = class->pick_next_task(rq); 3857 p = class->pick_next_task(rq);
3729 if (p) 3858 if (p)
3730 return p; 3859 return p;
3731 /*
3732 * Will never be NULL as the idle class always
3733 * returns a non-NULL p:
3734 */
3735 class = class->next;
3736 } 3860 }
3861
3862 BUG(); /* the idle class will always have a runnable task */
3737} 3863}
3738 3864
3739/* 3865/*
@@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4358 4484
4359 rq = task_rq_lock(p, &flags); 4485 rq = task_rq_lock(p, &flags);
4360 4486
4487 trace_sched_pi_setprio(p, prio);
4361 oldprio = p->prio; 4488 oldprio = p->prio;
4362 prev_class = p->sched_class; 4489 prev_class = p->sched_class;
4363 on_rq = p->se.on_rq; 4490 on_rq = p->se.on_rq;
@@ -4661,6 +4788,15 @@ recheck:
4661 */ 4788 */
4662 rq = __task_rq_lock(p); 4789 rq = __task_rq_lock(p);
4663 4790
4791 /*
4792 * Changing the policy of the stop threads is a very bad idea
4793 */
4794 if (p == rq->stop) {
4795 __task_rq_unlock(rq);
4796 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4797 return -EINVAL;
4798 }
4799
4664#ifdef CONFIG_RT_GROUP_SCHED 4800#ifdef CONFIG_RT_GROUP_SCHED
4665 if (user) { 4801 if (user) {
4666 /* 4802 /*
@@ -4893,7 +5029,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4893 5029
4894 cpuset_cpus_allowed(p, cpus_allowed); 5030 cpuset_cpus_allowed(p, cpus_allowed);
4895 cpumask_and(new_mask, in_mask, cpus_allowed); 5031 cpumask_and(new_mask, in_mask, cpus_allowed);
4896 again: 5032again:
4897 retval = set_cpus_allowed_ptr(p, new_mask); 5033 retval = set_cpus_allowed_ptr(p, new_mask);
4898 5034
4899 if (!retval) { 5035 if (!retval) {
@@ -6526,6 +6662,7 @@ struct s_data {
6526 cpumask_var_t nodemask; 6662 cpumask_var_t nodemask;
6527 cpumask_var_t this_sibling_map; 6663 cpumask_var_t this_sibling_map;
6528 cpumask_var_t this_core_map; 6664 cpumask_var_t this_core_map;
6665 cpumask_var_t this_book_map;
6529 cpumask_var_t send_covered; 6666 cpumask_var_t send_covered;
6530 cpumask_var_t tmpmask; 6667 cpumask_var_t tmpmask;
6531 struct sched_group **sched_group_nodes; 6668 struct sched_group **sched_group_nodes;
@@ -6537,6 +6674,7 @@ enum s_alloc {
6537 sa_rootdomain, 6674 sa_rootdomain,
6538 sa_tmpmask, 6675 sa_tmpmask,
6539 sa_send_covered, 6676 sa_send_covered,
6677 sa_this_book_map,
6540 sa_this_core_map, 6678 sa_this_core_map,
6541 sa_this_sibling_map, 6679 sa_this_sibling_map,
6542 sa_nodemask, 6680 sa_nodemask,
@@ -6572,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6572#ifdef CONFIG_SCHED_MC 6710#ifdef CONFIG_SCHED_MC
6573static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6711static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6574static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6712static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6575#endif /* CONFIG_SCHED_MC */
6576 6713
6577#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6578static int 6714static int
6579cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6715cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6580 struct sched_group **sg, struct cpumask *mask) 6716 struct sched_group **sg, struct cpumask *mask)
6581{ 6717{
6582 int group; 6718 int group;
6583 6719#ifdef CONFIG_SCHED_SMT
6584 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6720 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6585 group = cpumask_first(mask); 6721 group = cpumask_first(mask);
6722#else
6723 group = cpu;
6724#endif
6586 if (sg) 6725 if (sg)
6587 *sg = &per_cpu(sched_group_core, group).sg; 6726 *sg = &per_cpu(sched_group_core, group).sg;
6588 return group; 6727 return group;
6589} 6728}
6590#elif defined(CONFIG_SCHED_MC) 6729#endif /* CONFIG_SCHED_MC */
6730
6731/*
6732 * book sched-domains:
6733 */
6734#ifdef CONFIG_SCHED_BOOK
6735static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6736static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6737
6591static int 6738static int
6592cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6739cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6593 struct sched_group **sg, struct cpumask *unused) 6740 struct sched_group **sg, struct cpumask *mask)
6594{ 6741{
6742 int group = cpu;
6743#ifdef CONFIG_SCHED_MC
6744 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6745 group = cpumask_first(mask);
6746#elif defined(CONFIG_SCHED_SMT)
6747 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6748 group = cpumask_first(mask);
6749#endif
6595 if (sg) 6750 if (sg)
6596 *sg = &per_cpu(sched_group_core, cpu).sg; 6751 *sg = &per_cpu(sched_group_book, group).sg;
6597 return cpu; 6752 return group;
6598} 6753}
6599#endif 6754#endif /* CONFIG_SCHED_BOOK */
6600 6755
6601static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6756static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6602static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6757static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6606,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6606 struct sched_group **sg, struct cpumask *mask) 6761 struct sched_group **sg, struct cpumask *mask)
6607{ 6762{
6608 int group; 6763 int group;
6609#ifdef CONFIG_SCHED_MC 6764#ifdef CONFIG_SCHED_BOOK
6765 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6766 group = cpumask_first(mask);
6767#elif defined(CONFIG_SCHED_MC)
6610 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6768 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6611 group = cpumask_first(mask); 6769 group = cpumask_first(mask);
6612#elif defined(CONFIG_SCHED_SMT) 6770#elif defined(CONFIG_SCHED_SMT)
@@ -6867,6 +7025,9 @@ SD_INIT_FUNC(CPU)
6867#ifdef CONFIG_SCHED_MC 7025#ifdef CONFIG_SCHED_MC
6868 SD_INIT_FUNC(MC) 7026 SD_INIT_FUNC(MC)
6869#endif 7027#endif
7028#ifdef CONFIG_SCHED_BOOK
7029 SD_INIT_FUNC(BOOK)
7030#endif
6870 7031
6871static int default_relax_domain_level = -1; 7032static int default_relax_domain_level = -1;
6872 7033
@@ -6916,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6916 free_cpumask_var(d->tmpmask); /* fall through */ 7077 free_cpumask_var(d->tmpmask); /* fall through */
6917 case sa_send_covered: 7078 case sa_send_covered:
6918 free_cpumask_var(d->send_covered); /* fall through */ 7079 free_cpumask_var(d->send_covered); /* fall through */
7080 case sa_this_book_map:
7081 free_cpumask_var(d->this_book_map); /* fall through */
6919 case sa_this_core_map: 7082 case sa_this_core_map:
6920 free_cpumask_var(d->this_core_map); /* fall through */ 7083 free_cpumask_var(d->this_core_map); /* fall through */
6921 case sa_this_sibling_map: 7084 case sa_this_sibling_map:
@@ -6962,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6962 return sa_nodemask; 7125 return sa_nodemask;
6963 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7126 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6964 return sa_this_sibling_map; 7127 return sa_this_sibling_map;
6965 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7128 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
6966 return sa_this_core_map; 7129 return sa_this_core_map;
7130 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7131 return sa_this_book_map;
6967 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7132 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6968 return sa_send_covered; 7133 return sa_send_covered;
6969 d->rd = alloc_rootdomain(); 7134 d->rd = alloc_rootdomain();
@@ -7021,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
7021 return sd; 7186 return sd;
7022} 7187}
7023 7188
7189static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7190 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7191 struct sched_domain *parent, int i)
7192{
7193 struct sched_domain *sd = parent;
7194#ifdef CONFIG_SCHED_BOOK
7195 sd = &per_cpu(book_domains, i).sd;
7196 SD_INIT(sd, BOOK);
7197 set_domain_attribute(sd, attr);
7198 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7199 sd->parent = parent;
7200 parent->child = sd;
7201 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7202#endif
7203 return sd;
7204}
7205
7024static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7206static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
7025 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7207 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7026 struct sched_domain *parent, int i) 7208 struct sched_domain *parent, int i)
@@ -7078,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7078 d->send_covered, d->tmpmask); 7260 d->send_covered, d->tmpmask);
7079 break; 7261 break;
7080#endif 7262#endif
7263#ifdef CONFIG_SCHED_BOOK
7264 case SD_LV_BOOK: /* set up book groups */
7265 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7266 if (cpu == cpumask_first(d->this_book_map))
7267 init_sched_build_groups(d->this_book_map, cpu_map,
7268 &cpu_to_book_group,
7269 d->send_covered, d->tmpmask);
7270 break;
7271#endif
7081 case SD_LV_CPU: /* set up physical groups */ 7272 case SD_LV_CPU: /* set up physical groups */
7082 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7273 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7083 if (!cpumask_empty(d->nodemask)) 7274 if (!cpumask_empty(d->nodemask))
@@ -7125,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7125 7316
7126 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7317 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7127 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7318 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7319 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7128 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7320 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7129 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7321 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7130 } 7322 }
7131 7323
7132 for_each_cpu(i, cpu_map) { 7324 for_each_cpu(i, cpu_map) {
7133 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7325 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7326 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7134 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7327 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7135 } 7328 }
7136 7329
@@ -7161,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7161 init_sched_groups_power(i, sd); 7354 init_sched_groups_power(i, sd);
7162 } 7355 }
7163#endif 7356#endif
7357#ifdef CONFIG_SCHED_BOOK
7358 for_each_cpu(i, cpu_map) {
7359 sd = &per_cpu(book_domains, i).sd;
7360 init_sched_groups_power(i, sd);
7361 }
7362#endif
7164 7363
7165 for_each_cpu(i, cpu_map) { 7364 for_each_cpu(i, cpu_map) {
7166 sd = &per_cpu(phys_domains, i).sd; 7365 sd = &per_cpu(phys_domains, i).sd;
@@ -7186,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7186 sd = &per_cpu(cpu_domains, i).sd; 7385 sd = &per_cpu(cpu_domains, i).sd;
7187#elif defined(CONFIG_SCHED_MC) 7386#elif defined(CONFIG_SCHED_MC)
7188 sd = &per_cpu(core_domains, i).sd; 7387 sd = &per_cpu(core_domains, i).sd;
7388#elif defined(CONFIG_SCHED_BOOK)
7389 sd = &per_cpu(book_domains, i).sd;
7189#else 7390#else
7190 sd = &per_cpu(phys_domains, i).sd; 7391 sd = &per_cpu(phys_domains, i).sd;
7191#endif 7392#endif
@@ -8090,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8090 8291
8091 return 1; 8292 return 1;
8092 8293
8093 err_free_rq: 8294err_free_rq:
8094 kfree(cfs_rq); 8295 kfree(cfs_rq);
8095 err: 8296err:
8096 return 0; 8297 return 0;
8097} 8298}
8098 8299
@@ -8180,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8180 8381
8181 return 1; 8382 return 1;
8182 8383
8183 err_free_rq: 8384err_free_rq:
8184 kfree(rt_rq); 8385 kfree(rt_rq);
8185 err: 8386err:
8186 return 0; 8387 return 0;
8187} 8388}
8188 8389
@@ -8540,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8540 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8741 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8541 } 8742 }
8542 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8743 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8543 unlock: 8744unlock:
8544 read_unlock(&tasklist_lock); 8745 read_unlock(&tasklist_lock);
8545 mutex_unlock(&rt_constraints_mutex); 8746 mutex_unlock(&rt_constraints_mutex);
8546 8747
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5f996d36ac5d..933f3d1b62ea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
25 25
26/* 26/*
27 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
28 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) 28 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
29 * 29 *
30 * NOTE: this latency value is not the same as the concept of 30 * NOTE: this latency value is not the same as the concept of
31 * 'timeslice length' - timeslices in CFS are of variable length 31 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 750000ULL; 57unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
519static void update_curr(struct cfs_rq *cfs_rq) 519static void update_curr(struct cfs_rq *cfs_rq)
520{ 520{
521 struct sched_entity *curr = cfs_rq->curr; 521 struct sched_entity *curr = cfs_rq->curr;
522 u64 now = rq_of(cfs_rq)->clock; 522 u64 now = rq_of(cfs_rq)->clock_task;
523 unsigned long delta_exec; 523 unsigned long delta_exec;
524 524
525 if (unlikely(!curr)) 525 if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602 /* 602 /*
603 * We are starting a new run period: 603 * We are starting a new run period:
604 */ 604 */
605 se->exec_start = rq_of(cfs_rq)->clock; 605 se->exec_start = rq_of(cfs_rq)->clock_task;
606} 606}
607 607
608/************************************************** 608/**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1764 set_task_cpu(p, this_cpu); 1764 set_task_cpu(p, this_cpu);
1765 activate_task(this_rq, p, 0); 1765 activate_task(this_rq, p, 0);
1766 check_preempt_curr(this_rq, p, 0); 1766 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1767} 1771}
1768 1772
1769/* 1773/*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1798 * 2) too many balance attempts have failed. 1802 * 2) too many balance attempts have failed.
1799 */ 1803 */
1800 1804
1801 tsk_cache_hot = task_hot(p, rq->clock, sd); 1805 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1802 if (!tsk_cache_hot || 1806 if (!tsk_cache_hot ||
1803 sd->nr_balance_failed > sd->cache_nice_tries) { 1807 sd->nr_balance_failed > sd->cache_nice_tries) {
1804#ifdef CONFIG_SCHEDSTATS 1808#ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
2030 unsigned long this_load; 2034 unsigned long this_load;
2031 unsigned long this_load_per_task; 2035 unsigned long this_load_per_task;
2032 unsigned long this_nr_running; 2036 unsigned long this_nr_running;
2037 unsigned long this_has_capacity;
2033 2038
2034 /* Statistics of the busiest group */ 2039 /* Statistics of the busiest group */
2035 unsigned long max_load; 2040 unsigned long max_load;
2036 unsigned long busiest_load_per_task; 2041 unsigned long busiest_load_per_task;
2037 unsigned long busiest_nr_running; 2042 unsigned long busiest_nr_running;
2038 unsigned long busiest_group_capacity; 2043 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity;
2039 2045
2040 int group_imb; /* Is there imbalance in this sd */ 2046 int group_imb; /* Is there imbalance in this sd */
2041#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
2058 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2059 unsigned long group_capacity; 2065 unsigned long group_capacity;
2060 int group_imb; /* Is there an imbalance in the group ? */ 2066 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */
2061}; 2068};
2062 2069
2063/** 2070/**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
2268 u64 total, available; 2275 u64 total, available;
2269 2276
2270 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2277 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2271 available = total - rq->rt_avg; 2278
2279 if (unlikely(total < rq->rt_avg)) {
2280 /* Ensures that power won't end up being negative */
2281 available = 0;
2282 } else {
2283 available = total - rq->rt_avg;
2284 }
2272 2285
2273 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2286 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2274 total = SCHED_LOAD_SCALE; 2287 total = SCHED_LOAD_SCALE;
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2378 int local_group, const struct cpumask *cpus, 2391 int local_group, const struct cpumask *cpus,
2379 int *balance, struct sg_lb_stats *sgs) 2392 int *balance, struct sg_lb_stats *sgs)
2380{ 2393{
2381 unsigned long load, max_cpu_load, min_cpu_load; 2394 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2382 int i; 2395 int i;
2383 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2396 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2384 unsigned long avg_load_per_task = 0; 2397 unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2389 /* Tally up the load of all CPUs in the group */ 2402 /* Tally up the load of all CPUs in the group */
2390 max_cpu_load = 0; 2403 max_cpu_load = 0;
2391 min_cpu_load = ~0UL; 2404 min_cpu_load = ~0UL;
2405 max_nr_running = 0;
2392 2406
2393 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2407 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2394 struct rq *rq = cpu_rq(i); 2408 struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2406 load = target_load(i, load_idx); 2420 load = target_load(i, load_idx);
2407 } else { 2421 } else {
2408 load = source_load(i, load_idx); 2422 load = source_load(i, load_idx);
2409 if (load > max_cpu_load) 2423 if (load > max_cpu_load) {
2410 max_cpu_load = load; 2424 max_cpu_load = load;
2425 max_nr_running = rq->nr_running;
2426 }
2411 if (min_cpu_load > load) 2427 if (min_cpu_load > load)
2412 min_cpu_load = load; 2428 min_cpu_load = load;
2413 } 2429 }
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2447 if (sgs->sum_nr_running) 2463 if (sgs->sum_nr_running)
2448 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2464 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2449 2465
2450 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 2466 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
2451 sgs->group_imb = 1; 2467 sgs->group_imb = 1;
2452 2468
2453 sgs->group_capacity = 2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2454 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2455 if (!sgs->group_capacity) 2470 if (!sgs->group_capacity)
2456 sgs->group_capacity = fix_small_capacity(sd, group); 2471 sgs->group_capacity = fix_small_capacity(sd, group);
2472
2473 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1;
2457} 2475}
2458 2476
2459/** 2477/**
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2542 /* 2560 /*
2543 * In case the child domain prefers tasks go to siblings 2561 * In case the child domain prefers tasks go to siblings
2544 * first, lower the sg capacity to one so that we'll try 2562 * first, lower the sg capacity to one so that we'll try
2545 * and move all the excess tasks away. 2563 * and move all the excess tasks away. We lower the capacity
2564 * of a group only if the local group has the capacity to fit
2565 * these excess tasks, i.e. nr_running < group_capacity. The
2566 * extra check prevents the case where you always pull from the
2567 * heaviest group when it is already under-utilized (possible
2568 * when a large weight task outweighs the tasks on the system).
2546 */ 2569 */
2547 if (prefer_sibling) 2570 if (prefer_sibling && !local_group && sds->this_has_capacity)
2548 sgs.group_capacity = min(sgs.group_capacity, 1UL); 2571 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2549 2572
2550 if (local_group) { 2573 if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2552 sds->this = sg; 2575 sds->this = sg;
2553 sds->this_nr_running = sgs.sum_nr_running; 2576 sds->this_nr_running = sgs.sum_nr_running;
2554 sds->this_load_per_task = sgs.sum_weighted_load; 2577 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity;
2555 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2556 sds->max_load = sgs.avg_load; 2580 sds->max_load = sgs.avg_load;
2557 sds->busiest = sg; 2581 sds->busiest = sg;
2558 sds->busiest_nr_running = sgs.sum_nr_running; 2582 sds->busiest_nr_running = sgs.sum_nr_running;
2559 sds->busiest_group_capacity = sgs.group_capacity; 2583 sds->busiest_group_capacity = sgs.group_capacity;
2560 sds->busiest_load_per_task = sgs.sum_weighted_load; 2584 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity;
2561 sds->group_imb = sgs.group_imb; 2586 sds->group_imb = sgs.group_imb;
2562 } 2587 }
2563 2588
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2754 return fix_small_imbalance(sds, this_cpu, imbalance); 2779 return fix_small_imbalance(sds, this_cpu, imbalance);
2755 2780
2756} 2781}
2782
2757/******* find_busiest_group() helpers end here *********************/ 2783/******* find_busiest_group() helpers end here *********************/
2758 2784
2759/** 2785/**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2805 * 4) This group is more busy than the avg busieness at this 2831 * 4) This group is more busy than the avg busieness at this
2806 * sched_domain. 2832 * sched_domain.
2807 * 5) The imbalance is within the specified limit. 2833 * 5) The imbalance is within the specified limit.
2834 *
2835 * Note: when doing newidle balance, if the local group has excess
2836 * capacity (i.e. nr_running < group_capacity) and the busiest group
2837 * does not have any capacity, we force a load balance to pull tasks
2838 * to the local group. In this case, we skip past checks 3, 4 and 5.
2808 */ 2839 */
2809 if (!(*balance)) 2840 if (!(*balance))
2810 goto ret; 2841 goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2816 if (!sds.busiest || sds.busiest_nr_running == 0) 2847 if (!sds.busiest || sds.busiest_nr_running == 0)
2817 goto out_balanced; 2848 goto out_balanced;
2818 2849
2850 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
2851 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
2852 !sds.busiest_has_capacity)
2853 goto force_balance;
2854
2819 if (sds.this_load >= sds.max_load) 2855 if (sds.this_load >= sds.max_load)
2820 goto out_balanced; 2856 goto out_balanced;
2821 2857
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2827 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2828 goto out_balanced; 2864 goto out_balanced;
2829 2865
2866force_balance:
2830 /* Looks like there is an imbalance. Compute it */ 2867 /* Looks like there is an imbalance. Compute it */
2831 calculate_imbalance(&sds, this_cpu, imbalance); 2868 calculate_imbalance(&sds, this_cpu, imbalance);
2832 return sds.busiest; 2869 return sds.busiest;
@@ -3031,7 +3068,14 @@ redo:
3031 3068
3032 if (!ld_moved) { 3069 if (!ld_moved) {
3033 schedstat_inc(sd, lb_failed[idle]); 3070 schedstat_inc(sd, lb_failed[idle]);
3034 sd->nr_balance_failed++; 3071 /*
3072 * Increment the failure counter only on periodic balance.
3073 * We do not want newidle balance, which can be very
3074 * frequent, to pollute the failure counter, causing
3075 * excessive cache_hot migrations and active balances.
3076 */
3077 if (idle != CPU_NEWLY_IDLE)
3078 sd->nr_balance_failed++;
3035 3079
3036 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3080 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3037 this_cpu)) { 3081 this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3153 interval = msecs_to_jiffies(sd->balance_interval); 3197 interval = msecs_to_jiffies(sd->balance_interval);
3154 if (time_after(next_balance, sd->last_balance + interval)) 3198 if (time_after(next_balance, sd->last_balance + interval))
3155 next_balance = sd->last_balance + interval; 3199 next_balance = sd->last_balance + interval;
3156 if (pulled_task) { 3200 if (pulled_task)
3157 this_rq->idle_stamp = 0;
3158 break; 3201 break;
3159 }
3160 } 3202 }
3161 3203
3162 raw_spin_lock(&this_rq->lock); 3204 raw_spin_lock(&this_rq->lock);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..185f920ec1a2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
61 * release the lock. Decreases scheduling overhead. 61 * release the lock. Decreases scheduling overhead.
62 */ 62 */
63SCHED_FEAT(OWNER_SPIN, 1) 63SCHED_FEAT(OWNER_SPIN, 1)
64
65/*
66 * Decrement CPU power based on irq activity
67 */
68SCHED_FEAT(NONIRQ_POWER, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d10c80ebb67a..bea7d79f7e9c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
609 if (!task_has_rt_policy(curr)) 609 if (!task_has_rt_policy(curr))
610 return; 610 return;
611 611
612 delta_exec = rq->clock - curr->se.exec_start; 612 delta_exec = rq->clock_task - curr->se.exec_start;
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
620 620
621 curr->se.exec_start = rq->clock; 621 curr->se.exec_start = rq->clock_task;
622 cpuacct_charge(curr, delta_exec); 622 cpuacct_charge(curr, delta_exec);
623 623
624 sched_rt_avg_update(rq, delta_exec); 624 sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
960 * runqueue. Otherwise simply start this RT task 960 * runqueue. Otherwise simply start this RT task
961 * on its current runqueue. 961 * on its current runqueue.
962 * 962 *
963 * We want to avoid overloading runqueues. Even if 963 * We want to avoid overloading runqueues. If the woken
964 * the RT task is of higher priority than the current RT task. 964 * task is a higher priority, then it will stay on this CPU
965 * RT tasks behave differently than other tasks. If 965 * and the lower prio task should be moved to another CPU.
966 * one gets preempted, we try to push it off to another queue. 966 * Even though this will probably make the lower prio task
967 * So trying to keep a preempting RT task on the same 967 * lose its cache, we do not want to bounce a higher task
968 * cache hot CPU will force the running RT task to 968 * around just because it gave up its CPU, perhaps for a
969 * a cold CPU. So we waste all the cache for the lower 969 * lock?
970 * RT task in hopes of saving some of a RT task 970 *
971 * that is just being woken and probably will have 971 * For equal prio tasks, we just let the scheduler sort it out.
972 * cold cache anyway.
973 */ 972 */
974 if (unlikely(rt_task(rq->curr)) && 973 if (unlikely(rt_task(rq->curr)) &&
974 (rq->curr->rt.nr_cpus_allowed < 2 ||
975 rq->curr->prio < p->prio) &&
975 (p->rt.nr_cpus_allowed > 1)) { 976 (p->rt.nr_cpus_allowed > 1)) {
976 int cpu = find_lowest_rq(p); 977 int cpu = find_lowest_rq(p);
977 978
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1074 } while (rt_rq); 1075 } while (rt_rq);
1075 1076
1076 p = rt_task_of(rt_se); 1077 p = rt_task_of(rt_se);
1077 p->se.exec_start = rq->clock; 1078 p->se.exec_start = rq->clock_task;
1078 1079
1079 return p; 1080 return p;
1080} 1081}
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1139 for_each_leaf_rt_rq(rt_rq, rq) { 1140 for_each_leaf_rt_rq(rt_rq, rq) {
1140 array = &rt_rq->active; 1141 array = &rt_rq->active;
1141 idx = sched_find_first_bit(array->bitmap); 1142 idx = sched_find_first_bit(array->bitmap);
1142 next_idx: 1143next_idx:
1143 if (idx >= MAX_RT_PRIO) 1144 if (idx >= MAX_RT_PRIO)
1144 continue; 1145 continue;
1145 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
1315 if (!next_task) 1316 if (!next_task)
1316 return 0; 1317 return 0;
1317 1318
1318 retry: 1319retry:
1319 if (unlikely(next_task == rq->curr)) { 1320 if (unlikely(next_task == rq->curr)) {
1320 WARN_ON(1); 1321 WARN_ON(1);
1321 return 0; 1322 return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
1463 * but possible) 1464 * but possible)
1464 */ 1465 */
1465 } 1466 }
1466 skip: 1467skip:
1467 double_unlock_balance(this_rq, src_rq); 1468 double_unlock_balance(this_rq, src_rq);
1468 } 1469 }
1469 1470
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1491 if (!task_running(rq, p) && 1492 if (!task_running(rq, p) &&
1492 !test_tsk_need_resched(rq->curr) && 1493 !test_tsk_need_resched(rq->curr) &&
1493 has_pushable_tasks(rq) && 1494 has_pushable_tasks(rq) &&
1494 p->rt.nr_cpus_allowed > 1) 1495 p->rt.nr_cpus_allowed > 1 &&
1496 rt_task(rq->curr) &&
1497 (rq->curr->rt.nr_cpus_allowed < 2 ||
1498 rq->curr->prio < p->prio))
1495 push_rt_tasks(rq); 1499 push_rt_tasks(rq);
1496} 1500}
1497 1501
@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1709{ 1713{
1710 struct task_struct *p = rq->curr; 1714 struct task_struct *p = rq->curr;
1711 1715
1712 p->se.exec_start = rq->clock; 1716 p->se.exec_start = rq->clock_task;
1713 1717
1714 /* The running task is never eligible for pushing */ 1718 /* The running task is never eligible for pushing */
1715 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 000000000000..45bddc0c1048
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
1/*
2 * stop-task scheduling class.
3 *
4 * The stop task is the highest priority task in the system, it preempts
5 * everything and will be preempted by nothing.
6 *
7 * See kernel/stop_machine.c
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p,
13 int sd_flag, int flags)
14{
15 return task_cpu(p); /* stop tasks never migrate */
16}
17#endif /* CONFIG_SMP */
18
19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{
22 resched_task(rq->curr); /* we preempt everything */
23}
24
25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{
27 struct task_struct *stop = rq->stop;
28
29 if (stop && stop->state == TASK_RUNNING)
30 return stop;
31
32 return NULL;
33}
34
35static void
36enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
37{
38}
39
40static void
41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
42{
43}
44
45static void yield_task_stop(struct rq *rq)
46{
47 BUG(); /* the stop task should never yield, it's pointless. */
48}
49
50static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
51{
52}
53
54static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_stop(struct rq *rq)
59{
60}
61
62static void switched_to_stop(struct rq *rq, struct task_struct *p,
63 int running)
64{
65 BUG(); /* it's impossible to change to this class */
66}
67
68static void prio_changed_stop(struct rq *rq, struct task_struct *p,
69 int oldprio, int running)
70{
71 BUG(); /* how!?, what priority? */
72}
73
74static unsigned int
75get_rr_interval_stop(struct rq *rq, struct task_struct *task)
76{
77 return 0;
78}
79
80/*
81 * Simple, special scheduling class for the per-CPU stop tasks:
82 */
83static const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class,
85
86 .enqueue_task = enqueue_task_stop,
87 .dequeue_task = dequeue_task_stop,
88 .yield_task = yield_task_stop,
89
90 .check_preempt_curr = check_preempt_curr_stop,
91
92 .pick_next_task = pick_next_task_stop,
93 .put_prev_task = put_prev_task_stop,
94
95#ifdef CONFIG_SMP
96 .select_task_rq = select_task_rq_stop,
97#endif
98
99 .set_curr_task = set_curr_task_stop,
100 .task_tick = task_tick_stop,
101
102 .get_rr_interval = get_rr_interval_stop,
103
104 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108};
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..79ee8f1fc0e7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
77} 77}
78 78
79/* 79/*
80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled.
87 */
88
89/*
80 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
81 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
82 */ 92 */
83#ifdef CONFIG_TRACE_IRQFLAGS 93#ifdef CONFIG_TRACE_IRQFLAGS
84static void __local_bh_disable(unsigned long ip) 94static void __local_bh_disable(unsigned long ip, unsigned int cnt)
85{ 95{
86 unsigned long flags; 96 unsigned long flags;
87 97
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
95 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
96 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
97 */ 107 */
98 preempt_count() += SOFTIRQ_OFFSET; 108 preempt_count() += cnt;
99 /* 109 /*
100 * Were softirqs turned off above: 110 * Were softirqs turned off above:
101 */ 111 */
102 if (softirq_count() == SOFTIRQ_OFFSET) 112 if (softirq_count() == cnt)
103 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
104 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
105 115
106 if (preempt_count() == SOFTIRQ_OFFSET) 116 if (preempt_count() == cnt)
107 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
108} 118}
109#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
110static inline void __local_bh_disable(unsigned long ip) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
111{ 121{
112 add_preempt_count(SOFTIRQ_OFFSET); 122 add_preempt_count(cnt);
113 barrier(); 123 barrier();
114} 124}
115#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
116 126
117void local_bh_disable(void) 127void local_bh_disable(void)
118{ 128{
119 __local_bh_disable((unsigned long)__builtin_return_address(0)); 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET);
120} 131}
121 132
122EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
123 134
135static void __local_bh_enable(unsigned int cnt)
136{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled());
139
140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt);
143}
144
124/* 145/*
125 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
126 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
128 */ 149 */
129void _local_bh_enable(void) 150void _local_bh_enable(void)
130{ 151{
131 WARN_ON_ONCE(in_irq()); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
132 WARN_ON_ONCE(!irqs_disabled());
133
134 if (softirq_count() == SOFTIRQ_OFFSET)
135 trace_softirqs_on((unsigned long)__builtin_return_address(0));
136 sub_preempt_count(SOFTIRQ_OFFSET);
137} 153}
138 154
139EXPORT_SYMBOL(_local_bh_enable); 155EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
147 /* 163 /*
148 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
149 */ 165 */
150 if (softirq_count() == SOFTIRQ_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
151 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
152 /* 168 /*
153 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
154 * softirq processing: 170 * softirq processing:
155 */ 171 */
156 sub_preempt_count(SOFTIRQ_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
157 173
158 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
159 do_softirq(); 175 do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
198 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
199 account_system_vtime(current); 215 account_system_vtime(current);
200 216
201 __local_bh_disable((unsigned long)__builtin_return_address(0)); 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET);
202 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
203 220
204 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
@@ -245,7 +262,7 @@ restart:
245 lockdep_softirq_exit(); 262 lockdep_softirq_exit();
246 263
247 account_system_vtime(current); 264 account_system_vtime(current);
248 _local_bh_enable(); 265 __local_bh_enable(SOFTIRQ_OFFSET);
249} 266}
250 267
251#ifndef __ARCH_HAS_DO_SOFTIRQ 268#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +296,16 @@ void irq_enter(void)
279 296
280 rcu_irq_enter(); 297 rcu_irq_enter();
281 if (idle_cpu(cpu) && !in_interrupt()) { 298 if (idle_cpu(cpu) && !in_interrupt()) {
282 __irq_enter(); 299 /*
300 * Prevent raise_softirq from needlessly waking up ksoftirqd
301 * here, as softirq will be serviced on return from interrupt.
302 */
303 local_bh_disable();
283 tick_check_idle(cpu); 304 tick_check_idle(cpu);
284 } else 305 _local_bh_enable();
285 __irq_enter(); 306 }
307
308 __irq_enter();
286} 309}
287 310
288#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 311#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu)
696{ 719{
697 set_current_state(TASK_INTERRUPTIBLE); 720 set_current_state(TASK_INTERRUPTIBLE);
698 721
722 current->flags |= PF_KSOFTIRQD;
699 while (!kthread_should_stop()) { 723 while (!kthread_should_stop()) {
700 preempt_disable(); 724 preempt_disable();
701 if (!local_softirq_pending()) { 725 if (!local_softirq_pending()) {
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4372ccb25127..090c28812ce1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -287,11 +287,12 @@ repeat:
287 goto repeat; 287 goto repeat;
288} 288}
289 289
290extern void sched_set_stop_task(int cpu, struct task_struct *stop);
291
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 292/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 293static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu) 294 unsigned long action, void *hcpu)
293{ 295{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu; 296 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 297 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p; 298 struct task_struct *p;
@@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
304 cpu); 305 cpu);
305 if (IS_ERR(p)) 306 if (IS_ERR(p))
306 return NOTIFY_BAD; 307 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p);
309 stopper->thread = p; 311 stopper->thread = p;
310 break; 312 break;
311 313
312 case CPU_ONLINE: 314 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */ 315 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread); 316 wake_up_process(stopper->thread);
316 /* mark enabled */ 317 /* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
325 { 326 {
326 struct cpu_stop_work *work; 327 struct cpu_stop_work *work;
327 328
329 sched_set_stop_task(cpu, NULL);
328 /* kill the stopper */ 330 /* kill the stopper */
329 kthread_stop(stopper->thread); 331 kthread_stop(stopper->thread);
330 /* drain remaining works */ 332 /* drain remaining works */
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 78ef2c5e130b..37dff78e9cb1 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -123,7 +123,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
123 * calls by looking at the number of nested bh disable calls because 123 * calls by looking at the number of nested bh disable calls because
124 * softirqs always disables bh. 124 * softirqs always disables bh.
125 */ 125 */
126 if (softirq_count() != SOFTIRQ_OFFSET) { 126 if (in_serving_softirq()) {
127 /* If there is an sk_classid we'll use that. */ 127 /* If there is an sk_classid we'll use that. */
128 if (!skb->sk) 128 if (!skb->sk)
129 return -1; 129 return -1;