Author:     Martin Schwidefsky <schwidefsky@de.ibm.com>  2011-12-19 13:23:15 -0500
Committer:  Martin Schwidefsky <schwidefsky@de.ibm.com>  2011-12-19 13:23:15 -0500
Commit:     612ef28a045efadb3a98d4492ead7806a146485d
Tree:       05621c87b37e91c27b06d450d76adffe97ce9666
Parents:    c3e0ef9a298e028a82ada28101ccd5cf64d209ee
            07cde2608a3b5c66515363f1b53623b1536b9785

Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into cputime-tip

Conflicts:
	drivers/cpufreq/cpufreq_conservative.c
	drivers/cpufreq/cpufreq_ondemand.c
	drivers/macintosh/rack-meter.c
	fs/proc/stat.c
	fs/proc/uptime.c
	kernel/sched/core.c
 -rw-r--r--  arch/s390/appldata/appldata_os.c | 16
 -rw-r--r--  arch/x86/include/asm/i387.h | 2
 -rw-r--r--  drivers/cpufreq/cpufreq_conservative.c | 41
 -rw-r--r--  drivers/cpufreq/cpufreq_ondemand.c | 41
 -rw-r--r--  drivers/macintosh/rack-meter.c | 7
 -rw-r--r--  fs/proc/stat.c | 52
 -rw-r--r--  fs/proc/uptime.c | 4
 -rw-r--r--  include/linux/kernel_stat.h | 36
 -rw-r--r--  include/linux/latencytop.h | 3
 -rw-r--r--  include/linux/sched.h | 19
 -rw-r--r--  include/trace/events/sched.h | 7
 -rw-r--r--  kernel/Makefile | 20
 -rw-r--r--  kernel/sched/Makefile | 20
 -rw-r--r--  kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c) | 33
 -rw-r--r--  kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h) | 26
 -rw-r--r--  kernel/sched/clock.c (renamed from kernel/sched_clock.c) | 0
 -rw-r--r--  kernel/sched/core.c (renamed from kernel/sched.c) | 2098
 -rw-r--r--  kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c) | 4
 -rw-r--r--  kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h) | 0
 -rw-r--r--  kernel/sched/debug.c (renamed from kernel/sched_debug.c) | 6
 -rw-r--r--  kernel/sched/fair.c (renamed from kernel/sched_fair.c) | 929
 -rw-r--r--  kernel/sched/features.h (renamed from kernel/sched_features.h) | 30
 -rw-r--r--  kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c) | 4
 -rw-r--r--  kernel/sched/rt.c (renamed from kernel/sched_rt.c) | 218
 -rw-r--r--  kernel/sched/sched.h | 1136
 -rw-r--r--  kernel/sched/stats.c | 111
 -rw-r--r--  kernel/sched/stats.h (renamed from kernel/sched_stats.h) | 103
 -rw-r--r--  kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c) | 4
 -rw-r--r--  kernel/time/tick-sched.c | 9
 29 files changed, 2606 insertions(+), 2373 deletions(-)
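
This merge combines two strands from tip:sched/core with the cputime branch: the scheduler sources move out of kernel/sched.c into a kernel/sched/ directory, and the per-CPU time accounting is converted from the named fields of struct cpu_usage_stat to a u64 array inside struct kernel_cpustat, indexed by the new enum cpu_usage_stat. The accessor change recurs throughout the diff; a minimal before/after sketch (not a literal excerpt):

/* before: one cputime64_t field per category */
cputime64_t user_old = kstat_cpu(cpu).cpustat.user;

/* after: one u64 slot per category, indexed by enum cpu_usage_stat */
u64 user_new = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
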
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 92f1cb745d6..4de031d6b76 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -115,21 +115,21 @@ static void appldata_get_os_data(void *data)
115 j = 0; 115 j = 0;
116 for_each_online_cpu(i) { 116 for_each_online_cpu(i) {
117 os_data->os_cpu[j].per_cpu_user = 117 os_data->os_cpu[j].per_cpu_user =
118 cputime_to_jiffies(kstat_cpu(i).cpustat.user); 118 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]);
119 os_data->os_cpu[j].per_cpu_nice = 119 os_data->os_cpu[j].per_cpu_nice =
120 cputime_to_jiffies(kstat_cpu(i).cpustat.nice); 120 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]);
121 os_data->os_cpu[j].per_cpu_system = 121 os_data->os_cpu[j].per_cpu_system =
122 cputime_to_jiffies(kstat_cpu(i).cpustat.system); 122 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]);
123 os_data->os_cpu[j].per_cpu_idle = 123 os_data->os_cpu[j].per_cpu_idle =
124 cputime_to_jiffies(kstat_cpu(i).cpustat.idle); 124 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]);
125 os_data->os_cpu[j].per_cpu_irq = 125 os_data->os_cpu[j].per_cpu_irq =
126 cputime_to_jiffies(kstat_cpu(i).cpustat.irq); 126 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]);
127 os_data->os_cpu[j].per_cpu_softirq = 127 os_data->os_cpu[j].per_cpu_softirq =
128 cputime_to_jiffies(kstat_cpu(i).cpustat.softirq); 128 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]);
129 os_data->os_cpu[j].per_cpu_iowait = 129 os_data->os_cpu[j].per_cpu_iowait =
130 cputime_to_jiffies(kstat_cpu(i).cpustat.iowait); 130 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]);
131 os_data->os_cpu[j].per_cpu_steal = 131 os_data->os_cpu[j].per_cpu_steal =
132 cputime_to_jiffies(kstat_cpu(i).cpustat.steal); 132 cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]);
133 os_data->os_cpu[j].cpu_id = i; 133 os_data->os_cpu[j].cpu_id = i;
134 j++; 134 j++;
135 } 135 }
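
The s390 appldata conversion above still copies each category field by field. Because the counters are now an array indexed by enum cpu_usage_stat, a helper written once for any category becomes possible; a hypothetical sketch (the patch itself keeps the explicit assignments shown above):

/* hypothetical helper, not part of the patch */
static inline unsigned long per_cpu_stat_jiffies(int cpu, enum cpu_usage_stat idx)
{
	return cputime_to_jiffies(kcpustat_cpu(cpu).cpustat[idx]);
}

/* e.g.: os_data->os_cpu[j].per_cpu_user = per_cpu_stat_jiffies(i, CPUTIME_USER); */
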
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c9e09ea0564..6919e936345 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
218#ifdef CONFIG_SMP 218#ifdef CONFIG_SMP
219#define safe_address (__per_cpu_offset[0]) 219#define safe_address (__per_cpu_offset[0])
220#else 220#else
221#define safe_address (kstat_cpu(0).cpustat.user) 221#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER])
222#endif 222#endif
223 223
224/* 224/*
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 7f31a031c0b..235a340e81f 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -95,26 +95,26 @@ static struct dbs_tuners {
95 .freq_step = 5, 95 .freq_step = 5,
96}; 96};
97 97
98static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, 98static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
99 cputime64_t *wall)
100{ 99{
101 cputime64_t idle_time; 100 u64 idle_time;
102 cputime64_t cur_wall_time; 101 u64 cur_wall_time;
103 cputime64_t busy_time; 102 u64 busy_time;
104 103
105 cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); 104 cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
106 busy_time = kstat_cpu(cpu).cpustat.user; 105
107 busy_time += kstat_cpu(cpu).cpustat.system; 106 busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
108 busy_time += kstat_cpu(cpu).cpustat.irq; 107 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
109 busy_time += kstat_cpu(cpu).cpustat.softirq; 108 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
110 busy_time += kstat_cpu(cpu).cpustat.steal; 109 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
111 busy_time += kstat_cpu(cpu).cpustat.nice; 110 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
111 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
112 112
113 idle_time = cur_wall_time - busy_time; 113 idle_time = cur_wall_time - busy_time;
114 if (wall) 114 if (wall)
115 *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time); 115 *wall = jiffies_to_usecs(cur_wall_time);
116 116
117 return (cputime64_t)jiffies_to_usecs(idle_time); 117 return jiffies_to_usecs(idle_time);
118} 118}
119 119
120static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) 120static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
@@ -271,7 +271,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
271 dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 271 dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
272 &dbs_info->prev_cpu_wall); 272 &dbs_info->prev_cpu_wall);
273 if (dbs_tuners_ins.ignore_nice) 273 if (dbs_tuners_ins.ignore_nice)
274 dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; 274 dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
275 } 275 }
276 return count; 276 return count;
277} 277}
@@ -361,11 +361,11 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
361 j_dbs_info->prev_cpu_idle = cur_idle_time; 361 j_dbs_info->prev_cpu_idle = cur_idle_time;
362 362
363 if (dbs_tuners_ins.ignore_nice) { 363 if (dbs_tuners_ins.ignore_nice) {
364 cputime64_t cur_nice; 364 u64 cur_nice;
365 unsigned long cur_nice_jiffies; 365 unsigned long cur_nice_jiffies;
366 366
367 cur_nice = kstat_cpu(j).cpustat.nice - 367 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
368 j_dbs_info->prev_cpu_nice; 368 j_dbs_info->prev_cpu_nice;
369 /* 369 /*
370 * Assumption: nice time between sampling periods will 370 * Assumption: nice time between sampling periods will
371 * be less than 2^32 jiffies for 32 bit sys 371 * be less than 2^32 jiffies for 32 bit sys
@@ -373,7 +373,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
373 cur_nice_jiffies = (unsigned long) 373 cur_nice_jiffies = (unsigned long)
374 cputime64_to_jiffies64(cur_nice); 374 cputime64_to_jiffies64(cur_nice);
375 375
376 j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; 376 j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
377 idle_time += jiffies_to_usecs(cur_nice_jiffies); 377 idle_time += jiffies_to_usecs(cur_nice_jiffies);
378 } 378 }
379 379
@@ -500,10 +500,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
500 500
501 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 501 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
502 &j_dbs_info->prev_cpu_wall); 502 &j_dbs_info->prev_cpu_wall);
503 if (dbs_tuners_ins.ignore_nice) { 503 if (dbs_tuners_ins.ignore_nice)
504 j_dbs_info->prev_cpu_nice = 504 j_dbs_info->prev_cpu_nice =
505 kstat_cpu(j).cpustat.nice; 505 kcpustat_cpu(j).cpustat[CPUTIME_NICE];
506 }
507 } 506 }
508 this_dbs_info->down_skip = 0; 507 this_dbs_info->down_skip = 0;
509 this_dbs_info->requested_freq = policy->cur; 508 this_dbs_info->requested_freq = policy->cur;
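
The flattened two-column rendering above is hard to follow, so for reference the resulting helper in the conservative governor reads as follows: idle time is wall time minus the sum of all accounted busy categories, both reported in microseconds. This is a consolidation of the new-side code above, not an additional change:

static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
{
	u64 cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
	u64 busy_time;

	busy_time  = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];

	if (wall)
		*wall = jiffies_to_usecs(cur_wall_time);

	return jiffies_to_usecs(cur_wall_time - busy_time);
}
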
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 07cffe2f6cf..3d679eee70a 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -119,26 +119,26 @@ static struct dbs_tuners {
119 .powersave_bias = 0, 119 .powersave_bias = 0,
120}; 120};
121 121
122static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, 122static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
123 cputime64_t *wall)
124{ 123{
125 cputime64_t idle_time; 124 u64 idle_time;
126 cputime64_t cur_wall_time; 125 u64 cur_wall_time;
127 cputime64_t busy_time; 126 u64 busy_time;
128 127
129 cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); 128 cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
130 busy_time = kstat_cpu(cpu).cpustat.user; 129
131 busy_time += kstat_cpu(cpu).cpustat.system; 130 busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
132 busy_time += kstat_cpu(cpu).cpustat.irq; 131 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
133 busy_time += kstat_cpu(cpu).cpustat.softirq; 132 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
134 busy_time += kstat_cpu(cpu).cpustat.steal; 133 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
135 busy_time += kstat_cpu(cpu).cpustat.nice; 134 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
135 busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
136 136
137 idle_time = cur_wall_time - busy_time; 137 idle_time = cur_wall_time - busy_time;
138 if (wall) 138 if (wall)
139 *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time); 139 *wall = jiffies_to_usecs(cur_wall_time);
140 140
141 return (cputime64_t)jiffies_to_usecs(idle_time); 141 return jiffies_to_usecs(idle_time);
142} 142}
143 143
144static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) 144static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
@@ -344,7 +344,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
344 dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 344 dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
345 &dbs_info->prev_cpu_wall); 345 &dbs_info->prev_cpu_wall);
346 if (dbs_tuners_ins.ignore_nice) 346 if (dbs_tuners_ins.ignore_nice)
347 dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; 347 dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
348 348
349 } 349 }
350 return count; 350 return count;
@@ -454,11 +454,11 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
454 j_dbs_info->prev_cpu_iowait = cur_iowait_time; 454 j_dbs_info->prev_cpu_iowait = cur_iowait_time;
455 455
456 if (dbs_tuners_ins.ignore_nice) { 456 if (dbs_tuners_ins.ignore_nice) {
457 cputime64_t cur_nice; 457 u64 cur_nice;
458 unsigned long cur_nice_jiffies; 458 unsigned long cur_nice_jiffies;
459 459
460 cur_nice = kstat_cpu(j).cpustat.nice - 460 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
461 j_dbs_info->prev_cpu_nice; 461 j_dbs_info->prev_cpu_nice;
462 /* 462 /*
463 * Assumption: nice time between sampling periods will 463 * Assumption: nice time between sampling periods will
464 * be less than 2^32 jiffies for 32 bit sys 464 * be less than 2^32 jiffies for 32 bit sys
@@ -466,7 +466,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
466 cur_nice_jiffies = (unsigned long) 466 cur_nice_jiffies = (unsigned long)
467 cputime64_to_jiffies64(cur_nice); 467 cputime64_to_jiffies64(cur_nice);
468 468
469 j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; 469 j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
470 idle_time += jiffies_to_usecs(cur_nice_jiffies); 470 idle_time += jiffies_to_usecs(cur_nice_jiffies);
471 } 471 }
472 472
@@ -645,10 +645,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
645 645
646 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, 646 j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
647 &j_dbs_info->prev_cpu_wall); 647 &j_dbs_info->prev_cpu_wall);
648 if (dbs_tuners_ins.ignore_nice) { 648 if (dbs_tuners_ins.ignore_nice)
649 j_dbs_info->prev_cpu_nice = 649 j_dbs_info->prev_cpu_nice =
650 kstat_cpu(j).cpustat.nice; 650 kcpustat_cpu(j).cpustat[CPUTIME_NICE];
651 }
652 } 651 }
653 this_dbs_info->cpu = cpu; 652 this_dbs_info->cpu = cpu;
654 this_dbs_info->rate_mult = 1; 653 this_dbs_info->rate_mult = 1;
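
The ondemand governor receives the same conversion. Note the ignore_nice handling shared by both governors: when nice time is to be treated as idle, the delta of CPUTIME_NICE since the previous sample is added to the measured idle time and prev_cpu_nice is refreshed. In compact form (restating the new-side code above):

if (dbs_tuners_ins.ignore_nice) {
	u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
		       j_dbs_info->prev_cpu_nice;
	unsigned long cur_nice_jiffies =
		(unsigned long) cputime64_to_jiffies64(cur_nice);

	j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
	idle_time += jiffies_to_usecs(cur_nice_jiffies);
}
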
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c
index 909908ebf16..6dc26b61219 100644
--- a/drivers/macintosh/rack-meter.c
+++ b/drivers/macintosh/rack-meter.c
@@ -81,12 +81,13 @@ static int rackmeter_ignore_nice;
81 */ 81 */
82static inline cputime64_t get_cpu_idle_time(unsigned int cpu) 82static inline cputime64_t get_cpu_idle_time(unsigned int cpu)
83{ 83{
84 cputime64_t retval; 84 u64 retval;
85 85
86 retval = kstat_cpu(cpu).cpustat.idle + kstat_cpu(cpu).cpustat.iowait; 86 retval = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE] +
87 kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
87 88
88 if (rackmeter_ignore_nice) 89 if (rackmeter_ignore_nice)
89 retval += kstat_cpu(cpu).cpustat.nice; 90 retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
90 91
91 return retval; 92 return retval;
92} 93}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 714d5d131e7..2527a68057f 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -22,14 +22,13 @@
22#define arch_idle_time(cpu) 0 22#define arch_idle_time(cpu) 0
23#endif 23#endif
24 24
25static cputime64_t get_idle_time(int cpu) 25static u64 get_idle_time(int cpu)
26{ 26{
27 u64 idle_time = get_cpu_idle_time_us(cpu, NULL); 27 u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL);
28 cputime64_t idle;
29 28
30 if (idle_time == -1ULL) { 29 if (idle_time == -1ULL) {
31 /* !NO_HZ so we can rely on cpustat.idle */ 30 /* !NO_HZ so we can rely on cpustat.idle */
32 idle = kstat_cpu(cpu).cpustat.idle; 31 idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
33 idle += arch_idle_time(cpu); 32 idle += arch_idle_time(cpu);
34 } else 33 } else
35 idle = nsecs_to_jiffies64(1000 * idle_time); 34 idle = nsecs_to_jiffies64(1000 * idle_time);
@@ -37,14 +36,13 @@ static cputime64_t get_idle_time(int cpu)
37 return idle; 36 return idle;
38} 37}
39 38
40static cputime64_t get_iowait_time(int cpu) 39static u64 get_iowait_time(int cpu)
41{ 40{
42 u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); 41 u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL);
43 cputime64_t iowait;
44 42
45 if (iowait_time == -1ULL) 43 if (iowait_time == -1ULL)
46 /* !NO_HZ so we can rely on cpustat.iowait */ 44 /* !NO_HZ so we can rely on cpustat.iowait */
47 iowait = kstat_cpu(cpu).cpustat.iowait; 45 iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
48 else 46 else
49 iowait = nsecs_to_jiffies64(1000 * iowait_time); 47 iowait = nsecs_to_jiffies64(1000 * iowait_time);
50 48
@@ -55,8 +53,8 @@ static int show_stat(struct seq_file *p, void *v)
55{ 53{
56 int i, j; 54 int i, j;
57 unsigned long jif; 55 unsigned long jif;
58 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; 56 u64 user, nice, system, idle, iowait, irq, softirq, steal;
59 cputime64_t guest, guest_nice; 57 u64 guest, guest_nice;
60 u64 sum = 0; 58 u64 sum = 0;
61 u64 sum_softirq = 0; 59 u64 sum_softirq = 0;
62 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; 60 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -69,18 +67,16 @@ static int show_stat(struct seq_file *p, void *v)
69 jif = boottime.tv_sec; 67 jif = boottime.tv_sec;
70 68
71 for_each_possible_cpu(i) { 69 for_each_possible_cpu(i) {
72 user += kstat_cpu(i).cpustat.user; 70 user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
73 nice += kstat_cpu(i).cpustat.nice; 71 nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
74 system += kstat_cpu(i).cpustat.system; 72 system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
75 idle += get_idle_time(i); 73 idle += get_idle_time(i);
76 iowait += get_iowait_time(i); 74 iowait += get_iowait_time(i);
77 irq += kstat_cpu(i).cpustat.irq; 75 irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
78 softirq += kstat_cpu(i).cpustat.softirq; 76 softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
79 steal += kstat_cpu(i).cpustat.steal; 77 steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
80 guest += kstat_cpu(i).cpustat.guest; 78 guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
81 guest_nice += kstat_cpu(i).cpustat.guest_nice; 79 guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
82 sum += kstat_cpu_irqs_sum(i);
83 sum += arch_irq_stat_cpu(i);
84 80
85 for (j = 0; j < NR_SOFTIRQS; j++) { 81 for (j = 0; j < NR_SOFTIRQS; j++) {
86 unsigned int softirq_stat = kstat_softirqs_cpu(j, i); 82 unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@@ -105,16 +101,16 @@ static int show_stat(struct seq_file *p, void *v)
105 (unsigned long long)cputime64_to_clock_t(guest_nice)); 101 (unsigned long long)cputime64_to_clock_t(guest_nice));
106 for_each_online_cpu(i) { 102 for_each_online_cpu(i) {
107 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 103 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
108 user = kstat_cpu(i).cpustat.user; 104 user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
109 nice = kstat_cpu(i).cpustat.nice; 105 nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
110 system = kstat_cpu(i).cpustat.system; 106 system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
111 idle = get_idle_time(i); 107 idle = get_idle_time(i);
112 iowait = get_iowait_time(i); 108 iowait = get_iowait_time(i);
113 irq = kstat_cpu(i).cpustat.irq; 109 irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
114 softirq = kstat_cpu(i).cpustat.softirq; 110 softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
115 steal = kstat_cpu(i).cpustat.steal; 111 steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
116 guest = kstat_cpu(i).cpustat.guest; 112 guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
117 guest_nice = kstat_cpu(i).cpustat.guest_nice; 113 guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
118 seq_printf(p, 114 seq_printf(p,
119 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " 115 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
120 "%llu\n", 116 "%llu\n",
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index ab515109fec..9610ac772d7 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -11,14 +11,14 @@ static int uptime_proc_show(struct seq_file *m, void *v)
11{ 11{
12 struct timespec uptime; 12 struct timespec uptime;
13 struct timespec idle; 13 struct timespec idle;
14 cputime64_t idletime; 14 u64 idletime;
15 u64 nsec; 15 u64 nsec;
16 u32 rem; 16 u32 rem;
17 int i; 17 int i;
18 18
19 idletime = 0; 19 idletime = 0;
20 for_each_possible_cpu(i) 20 for_each_possible_cpu(i)
21 idletime += kstat_cpu(i).cpustat.idle; 21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
22 22
23 do_posix_clock_monotonic_gettime(&uptime); 23 do_posix_clock_monotonic_gettime(&uptime);
24 monotonic_to_bootbased(&uptime); 24 monotonic_to_bootbased(&uptime);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 0cce2db580c..2fbd9053c2d 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -6,6 +6,7 @@
6#include <linux/percpu.h> 6#include <linux/percpu.h>
7#include <linux/cpumask.h> 7#include <linux/cpumask.h>
8#include <linux/interrupt.h> 8#include <linux/interrupt.h>
9#include <linux/sched.h>
9#include <asm/irq.h> 10#include <asm/irq.h>
10#include <asm/cputime.h> 11#include <asm/cputime.h>
11 12
@@ -15,21 +16,25 @@
15 * used by rstatd/perfmeter 16 * used by rstatd/perfmeter
16 */ 17 */
17 18
18struct cpu_usage_stat { 19enum cpu_usage_stat {
19 cputime64_t user; 20 CPUTIME_USER,
20 cputime64_t nice; 21 CPUTIME_NICE,
21 cputime64_t system; 22 CPUTIME_SYSTEM,
22 cputime64_t softirq; 23 CPUTIME_SOFTIRQ,
23 cputime64_t irq; 24 CPUTIME_IRQ,
24 cputime64_t idle; 25 CPUTIME_IDLE,
25 cputime64_t iowait; 26 CPUTIME_IOWAIT,
26 cputime64_t steal; 27 CPUTIME_STEAL,
27 cputime64_t guest; 28 CPUTIME_GUEST,
28 cputime64_t guest_nice; 29 CPUTIME_GUEST_NICE,
30 NR_STATS,
31};
32
33struct kernel_cpustat {
34 u64 cpustat[NR_STATS];
29}; 35};
30 36
31struct kernel_stat { 37struct kernel_stat {
32 struct cpu_usage_stat cpustat;
33#ifndef CONFIG_GENERIC_HARDIRQS 38#ifndef CONFIG_GENERIC_HARDIRQS
34 unsigned int irqs[NR_IRQS]; 39 unsigned int irqs[NR_IRQS];
35#endif 40#endif
@@ -38,10 +43,13 @@ struct kernel_stat {
38}; 43};
39 44
40DECLARE_PER_CPU(struct kernel_stat, kstat); 45DECLARE_PER_CPU(struct kernel_stat, kstat);
46DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
41 47
42#define kstat_cpu(cpu) per_cpu(kstat, cpu)
43/* Must have preemption disabled for this to be meaningful. */ 48/* Must have preemption disabled for this to be meaningful. */
44#define kstat_this_cpu __get_cpu_var(kstat) 49#define kstat_this_cpu (&__get_cpu_var(kstat))
50#define kcpustat_this_cpu (&__get_cpu_var(kernel_cpustat))
51#define kstat_cpu(cpu) per_cpu(kstat, cpu)
52#define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu)
45 53
46extern unsigned long long nr_context_switches(void); 54extern unsigned long long nr_context_switches(void);
47 55
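
kernel_stat.h now separates interrupt statistics (struct kernel_stat) from time accounting (struct kernel_cpustat), and the accessor macros are deliberately asymmetric: kstat_this_cpu and kcpustat_this_cpu evaluate to pointers (and require preemption to be disabled), while kstat_cpu(cpu) and kcpustat_cpu(cpu) evaluate to the per-CPU structure itself. A short usage sketch (illustrative only):

u64 my_user, cpu3_user;

my_user   = kcpustat_this_cpu->cpustat[CPUTIME_USER];	/* preemption disabled */
cpu3_user = kcpustat_cpu(3).cpustat[CPUTIME_USER];
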
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index b0e99898527..e23121f9d82 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -10,6 +10,8 @@
10#define _INCLUDE_GUARD_LATENCYTOP_H_ 10#define _INCLUDE_GUARD_LATENCYTOP_H_
11 11
12#include <linux/compiler.h> 12#include <linux/compiler.h>
13struct task_struct;
14
13#ifdef CONFIG_LATENCYTOP 15#ifdef CONFIG_LATENCYTOP
14 16
15#define LT_SAVECOUNT 32 17#define LT_SAVECOUNT 32
@@ -23,7 +25,6 @@ struct latency_record {
23}; 25};
24 26
25 27
26struct task_struct;
27 28
28extern int latencytop_enabled; 29extern int latencytop_enabled;
29void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); 30void __account_scheduler_latency(struct task_struct *task, int usecs, int inter);
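
The latencytop change simply hoists the struct task_struct forward declaration above the CONFIG_LATENCYTOP conditional, so prototypes and any stubs in the other branch can take a struct task_struct pointer without including sched.h. A minimal illustration of the pattern (names made up, not from the header):

struct task_struct;	/* an incomplete type is enough for pointer parameters */

#ifdef CONFIG_LATENCYTOP
void record_latency(struct task_struct *task, int usecs);
#else
static inline void record_latency(struct task_struct *task, int usecs) { }
#endif
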
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5649032d73f..5a2ab3c2757 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -273,9 +273,11 @@ extern int runqueue_is_locked(int cpu);
273 273
274#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 274#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
275extern void select_nohz_load_balancer(int stop_tick); 275extern void select_nohz_load_balancer(int stop_tick);
276extern void set_cpu_sd_state_idle(void);
276extern int get_nohz_timer_target(void); 277extern int get_nohz_timer_target(void);
277#else 278#else
278static inline void select_nohz_load_balancer(int stop_tick) { } 279static inline void select_nohz_load_balancer(int stop_tick) { }
280static inline void set_cpu_sd_state_idle(void) { }
279#endif 281#endif
280 282
281/* 283/*
@@ -901,6 +903,10 @@ struct sched_group_power {
901 * single CPU. 903 * single CPU.
902 */ 904 */
903 unsigned int power, power_orig; 905 unsigned int power, power_orig;
906 /*
907 * Number of busy cpus in this group.
908 */
909 atomic_t nr_busy_cpus;
904}; 910};
905 911
906struct sched_group { 912struct sched_group {
@@ -925,6 +931,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
925 return to_cpumask(sg->cpumask); 931 return to_cpumask(sg->cpumask);
926} 932}
927 933
934/**
935 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
936 * @group: The group whose first cpu is to be returned.
937 */
938static inline unsigned int group_first_cpu(struct sched_group *group)
939{
940 return cpumask_first(sched_group_cpus(group));
941}
942
928struct sched_domain_attr { 943struct sched_domain_attr {
929 int relax_domain_level; 944 int relax_domain_level;
930}; 945};
@@ -1315,8 +1330,8 @@ struct task_struct {
1315 * older sibling, respectively. (p->father can be replaced with 1330 * older sibling, respectively. (p->father can be replaced with
1316 * p->real_parent->pid) 1331 * p->real_parent->pid)
1317 */ 1332 */
1318 struct task_struct *real_parent; /* real parent process */ 1333 struct task_struct __rcu *real_parent; /* real parent process */
1319 struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */ 1334 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1320 /* 1335 /*
1321 * children/sibling forms the list of my natural children 1336 * children/sibling forms the list of my natural children
1322 */ 1337 */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 959ff18b63b..e33ed1bfa11 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -331,6 +331,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_iowait,
331 TP_ARGS(tsk, delay)); 331 TP_ARGS(tsk, delay));
332 332
333/* 333/*
334 * Tracepoint for accounting blocked time (time the task is in uninterruptible).
335 */
336DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
337 TP_PROTO(struct task_struct *tsk, u64 delay),
338 TP_ARGS(tsk, delay));
339
340/*
334 * Tracepoint for accounting runtime (time the task is executing 341 * Tracepoint for accounting runtime (time the task is executing
335 * on a CPU). 342 * on a CPU).
336 */ 343 */
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b9d02..f70396e5a24 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o 13 async.o range.o groups.o
14obj-y += groups.o
15 14
16ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
20CFLAGS_REMOVE_mutex-debug.o = -pg 19CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 20CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 21CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_irq_work.o = -pg 22CFLAGS_REMOVE_irq_work.o = -pg
25endif 23endif
26 24
25obj-y += sched/
26
27obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 28obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -99,7 +99,6 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 102obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_CPU_PM) += cpu_pm.o 103obj-$(CONFIG_CPU_PM) += cpu_pm.o
105 104
@@ -110,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
111obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
112 111
113ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
114# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
115# needed for x86 only. Why this used to be enabled for all architectures is beyond
116# me. I suspect most platforms don't need this, but until we know that for sure
117# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
118# to get a correct value for the wait-channel (WCHAN in ps). --davidm
119CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
120endif
121
122$(obj)/configs.o: $(obj)/config_data.h 112$(obj)/configs.o: $(obj)/config_data.h
123 113
124# config_data.h contains the same information as ikconfig.h but gzipped. 114# config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 00000000000..9a7dd35102a
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,20 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c
index 429242f3c48..e8a1f83ee0e 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched/auto_group.c
@@ -1,15 +1,19 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include "sched.h"
4
3#include <linux/proc_fs.h> 5#include <linux/proc_fs.h>
4#include <linux/seq_file.h> 6#include <linux/seq_file.h>
5#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
6#include <linux/utsname.h> 8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
7 11
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 13static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 14static atomic_t autogroup_seq_nr;
11 15
12static void __init autogroup_init(struct task_struct *init_task) 16void __init autogroup_init(struct task_struct *init_task)
13{ 17{
14 autogroup_default.tg = &root_task_group; 18 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref); 19 kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
17 init_task->signal->autogroup = &autogroup_default; 21 init_task->signal->autogroup = &autogroup_default;
18} 22}
19 23
20static inline void autogroup_free(struct task_group *tg) 24void autogroup_free(struct task_group *tg)
21{ 25{
22 kfree(tg->autogroup); 26 kfree(tg->autogroup);
23} 27}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
59 return ag; 63 return ag;
60} 64}
61 65
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
67{ 67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
108 return autogroup_kref_get(&autogroup_default); 108 return autogroup_kref_get(&autogroup_default);
109} 109}
110 110
111static inline bool 111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{ 112{
114 if (tg != &root_task_group) 113 if (tg != &root_task_group)
115 return false; 114 return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
127 return true; 126 return true;
128} 127}
129 128
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void 129static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag) 130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{ 131{
@@ -263,7 +246,7 @@ out:
263#endif /* CONFIG_PROC_FS */ 246#endif /* CONFIG_PROC_FS */
264 247
265#ifdef CONFIG_SCHED_DEBUG 248#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 249int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{ 250{
268 if (!task_group_is_autogroup(tg)) 251 if (!task_group_is_autogroup(tg))
269 return 0; 252 return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h
index c2f0e7248dc..8bd04714281 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched/auto_group.h
@@ -1,5 +1,8 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
3struct autogroup { 6struct autogroup {
4 /* 7 /*
5 * reference doesn't mean how many thread attach to this 8 * reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
13 int nice; 16 int nice;
14}; 17};
15 18
16static inline bool task_group_is_autogroup(struct task_group *tg); 19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
17static inline struct task_group * 29static inline struct task_group *
18autogroup_task_group(struct task_struct *p, struct task_group *tg); 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
19 41
20#else /* !CONFIG_SCHED_AUTOGROUP */ 42#else /* !CONFIG_SCHED_AUTOGROUP */
21 43
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c
index c685e31492d..c685e31492d 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched/clock.c
diff --git a/kernel/sched.c b/kernel/sched/core.c
index 18cad4467e6..cdf51a2adc2 100644
--- a/kernel/sched.c
+++ b/kernel/sched/core.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -75,129 +74,17 @@
75 74
76#include <asm/tlb.h> 75#include <asm/tlb.h>
77#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
78#include <asm/mutex.h>
79#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
80#include <asm/paravirt.h> 78#include <asm/paravirt.h>
81#endif 79#endif
82 80
83#include "sched_cpupri.h" 81#include "sched.h"
84#include "workqueue_sched.h" 82#include "../workqueue_sched.h"
85#include "sched_autogroup.h"
86 83
87#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 85#include <trace/events/sched.h>
89 86
90/* 87void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
91 * Convert user-nice values [ -20 ... 0 ... 19 ]
92 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
93 * and back.
94 */
95#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
96#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
97#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
98
99/*
100 * 'User priority' is the nice value converted to something we
101 * can work with better when scaling various scheduler parameters,
102 * it's a [ 0 ... 39 ] range.
103 */
104#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
105#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
106#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
107
108/*
109 * Helpers for converting nanosecond timing to jiffy resolution
110 */
111#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
112
113#define NICE_0_LOAD SCHED_LOAD_SCALE
114#define NICE_0_SHIFT SCHED_LOAD_SHIFT
115
116/*
117 * These are the 'tuning knobs' of the scheduler:
118 *
119 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
120 * Timeslices get refilled after they expire.
121 */
122#define DEF_TIMESLICE (100 * HZ / 1000)
123
124/*
125 * single value that denotes runtime == period, ie unlimited time.
126 */
127#define RUNTIME_INF ((u64)~0ULL)
128
129static inline int rt_policy(int policy)
130{
131 if (policy == SCHED_FIFO || policy == SCHED_RR)
132 return 1;
133 return 0;
134}
135
136static inline int task_has_rt_policy(struct task_struct *p)
137{
138 return rt_policy(p->policy);
139}
140
141/*
142 * This is the priority-queue data structure of the RT scheduling class:
143 */
144struct rt_prio_array {
145 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
146 struct list_head queue[MAX_RT_PRIO];
147};
148
149struct rt_bandwidth {
150 /* nests inside the rq lock: */
151 raw_spinlock_t rt_runtime_lock;
152 ktime_t rt_period;
153 u64 rt_runtime;
154 struct hrtimer rt_period_timer;
155};
156
157static struct rt_bandwidth def_rt_bandwidth;
158
159static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
160
161static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
162{
163 struct rt_bandwidth *rt_b =
164 container_of(timer, struct rt_bandwidth, rt_period_timer);
165 ktime_t now;
166 int overrun;
167 int idle = 0;
168
169 for (;;) {
170 now = hrtimer_cb_get_time(timer);
171 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
172
173 if (!overrun)
174 break;
175
176 idle = do_sched_rt_period_timer(rt_b, overrun);
177 }
178
179 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
180}
181
182static
183void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
184{
185 rt_b->rt_period = ns_to_ktime(period);
186 rt_b->rt_runtime = runtime;
187
188 raw_spin_lock_init(&rt_b->rt_runtime_lock);
189
190 hrtimer_init(&rt_b->rt_period_timer,
191 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
192 rt_b->rt_period_timer.function = sched_rt_period_timer;
193}
194
195static inline int rt_bandwidth_enabled(void)
196{
197 return sysctl_sched_rt_runtime >= 0;
198}
199
200static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
201{ 88{
202 unsigned long delta; 89 unsigned long delta;
203 ktime_t soft, hard, now; 90 ktime_t soft, hard, now;
@@ -217,580 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
217 } 104 }
218} 105}
219 106
220static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 107DEFINE_MUTEX(sched_domains_mutex);
221{ 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
223 return;
224
225 if (hrtimer_active(&rt_b->rt_period_timer))
226 return;
227
228 raw_spin_lock(&rt_b->rt_runtime_lock);
229 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
230 raw_spin_unlock(&rt_b->rt_runtime_lock);
231}
232
233#ifdef CONFIG_RT_GROUP_SCHED
234static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235{
236 hrtimer_cancel(&rt_b->rt_period_timer);
237}
238#endif
239
240/*
241 * sched_domains_mutex serializes calls to init_sched_domains,
242 * detach_destroy_domains and partition_sched_domains.
243 */
244static DEFINE_MUTEX(sched_domains_mutex);
245
246#ifdef CONFIG_CGROUP_SCHED
247
248#include <linux/cgroup.h>
249
250struct cfs_rq;
251
252static LIST_HEAD(task_groups);
253
254struct cfs_bandwidth {
255#ifdef CONFIG_CFS_BANDWIDTH
256 raw_spinlock_t lock;
257 ktime_t period;
258 u64 quota, runtime;
259 s64 hierarchal_quota;
260 u64 runtime_expires;
261
262 int idle, timer_active;
263 struct hrtimer period_timer, slack_timer;
264 struct list_head throttled_cfs_rq;
265
266 /* statistics */
267 int nr_periods, nr_throttled;
268 u64 throttled_time;
269#endif
270};
271
272/* task group related information */
273struct task_group {
274 struct cgroup_subsys_state css;
275
276#ifdef CONFIG_FAIR_GROUP_SCHED
277 /* schedulable entities of this group on each cpu */
278 struct sched_entity **se;
279 /* runqueue "owned" by this group on each cpu */
280 struct cfs_rq **cfs_rq;
281 unsigned long shares;
282
283 atomic_t load_weight;
284#endif
285
286#ifdef CONFIG_RT_GROUP_SCHED
287 struct sched_rt_entity **rt_se;
288 struct rt_rq **rt_rq;
289
290 struct rt_bandwidth rt_bandwidth;
291#endif
292
293 struct rcu_head rcu;
294 struct list_head list;
295
296 struct task_group *parent;
297 struct list_head siblings;
298 struct list_head children;
299
300#ifdef CONFIG_SCHED_AUTOGROUP
301 struct autogroup *autogroup;
302#endif
303
304 struct cfs_bandwidth cfs_bandwidth;
305};
306
307/* task_group_lock serializes the addition/removal of task groups */
308static DEFINE_SPINLOCK(task_group_lock);
309
310#ifdef CONFIG_FAIR_GROUP_SCHED
311
312# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
313
314/*
315 * A weight of 0 or 1 can cause arithmetics problems.
316 * A weight of a cfs_rq is the sum of weights of which entities
317 * are queued on this cfs_rq, so a weight of a entity should not be
318 * too large, so as the shares value of a task group.
319 * (The default weight is 1024 - so there's no practical
320 * limitation from this.)
321 */
322#define MIN_SHARES (1UL << 1)
323#define MAX_SHARES (1UL << 18)
324
325static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
326#endif
327
328/* Default task group.
329 * Every task in system belong to this group at bootup.
330 */
331struct task_group root_task_group;
332
333#endif /* CONFIG_CGROUP_SCHED */
334
335/* CFS-related fields in a runqueue */
336struct cfs_rq {
337 struct load_weight load;
338 unsigned long nr_running, h_nr_running;
339
340 u64 exec_clock;
341 u64 min_vruntime;
342#ifndef CONFIG_64BIT
343 u64 min_vruntime_copy;
344#endif
345
346 struct rb_root tasks_timeline;
347 struct rb_node *rb_leftmost;
348
349 struct list_head tasks;
350 struct list_head *balance_iterator;
351
352 /*
353 * 'curr' points to currently running entity on this cfs_rq.
354 * It is set to NULL otherwise (i.e when none are currently running).
355 */
356 struct sched_entity *curr, *next, *last, *skip;
357
358#ifdef CONFIG_SCHED_DEBUG
359 unsigned int nr_spread_over;
360#endif
361
362#ifdef CONFIG_FAIR_GROUP_SCHED
363 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
364
365 /*
366 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
367 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
368 * (like users, containers etc.)
369 *
370 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
371 * list is used during load balance.
372 */
373 int on_list;
374 struct list_head leaf_cfs_rq_list;
375 struct task_group *tg; /* group that "owns" this runqueue */
376
377#ifdef CONFIG_SMP
378 /*
379 * the part of load.weight contributed by tasks
380 */
381 unsigned long task_weight;
382
383 /*
384 * h_load = weight * f(tg)
385 *
386 * Where f(tg) is the recursive weight fraction assigned to
387 * this group.
388 */
389 unsigned long h_load;
390
391 /*
392 * Maintaining per-cpu shares distribution for group scheduling
393 *
394 * load_stamp is the last time we updated the load average
395 * load_last is the last time we updated the load average and saw load
396 * load_unacc_exec_time is currently unaccounted execution time
397 */
398 u64 load_avg;
399 u64 load_period;
400 u64 load_stamp, load_last, load_unacc_exec_time;
401
402 unsigned long load_contribution;
403#endif
404#ifdef CONFIG_CFS_BANDWIDTH
405 int runtime_enabled;
406 u64 runtime_expires;
407 s64 runtime_remaining;
408
409 u64 throttled_timestamp;
410 int throttled, throttle_count;
411 struct list_head throttled_list;
412#endif
413#endif
414};
415
416#ifdef CONFIG_FAIR_GROUP_SCHED
417#ifdef CONFIG_CFS_BANDWIDTH
418static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
419{
420 return &tg->cfs_bandwidth;
421}
422
423static inline u64 default_cfs_period(void);
424static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
425static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
426
427static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
428{
429 struct cfs_bandwidth *cfs_b =
430 container_of(timer, struct cfs_bandwidth, slack_timer);
431 do_sched_cfs_slack_timer(cfs_b);
432
433 return HRTIMER_NORESTART;
434}
435
436static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
437{
438 struct cfs_bandwidth *cfs_b =
439 container_of(timer, struct cfs_bandwidth, period_timer);
440 ktime_t now;
441 int overrun;
442 int idle = 0;
443
444 for (;;) {
445 now = hrtimer_cb_get_time(timer);
446 overrun = hrtimer_forward(timer, now, cfs_b->period);
447
448 if (!overrun)
449 break;
450
451 idle = do_sched_cfs_period_timer(cfs_b, overrun);
452 }
453
454 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
455}
456
457static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
458{
459 raw_spin_lock_init(&cfs_b->lock);
460 cfs_b->runtime = 0;
461 cfs_b->quota = RUNTIME_INF;
462 cfs_b->period = ns_to_ktime(default_cfs_period());
463
464 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
465 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
466 cfs_b->period_timer.function = sched_cfs_period_timer;
467 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
468 cfs_b->slack_timer.function = sched_cfs_slack_timer;
469}
470
471static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
472{
473 cfs_rq->runtime_enabled = 0;
474 INIT_LIST_HEAD(&cfs_rq->throttled_list);
475}
476
477/* requires cfs_b->lock, may release to reprogram timer */
478static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
479{
480 /*
481 * The timer may be active because we're trying to set a new bandwidth
482 * period or because we're racing with the tear-down path
483 * (timer_active==0 becomes visible before the hrtimer call-back
484 * terminates). In either case we ensure that it's re-programmed
485 */
486 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
487 raw_spin_unlock(&cfs_b->lock);
488 /* ensure cfs_b->lock is available while we wait */
489 hrtimer_cancel(&cfs_b->period_timer);
490
491 raw_spin_lock(&cfs_b->lock);
492 /* if someone else restarted the timer then we're done */
493 if (cfs_b->timer_active)
494 return;
495 }
496
497 cfs_b->timer_active = 1;
498 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
499}
500
501static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
502{
503 hrtimer_cancel(&cfs_b->period_timer);
504 hrtimer_cancel(&cfs_b->slack_timer);
505}
506#else
507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
508static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
510
511static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
512{
513 return NULL;
514}
515#endif /* CONFIG_CFS_BANDWIDTH */
516#endif /* CONFIG_FAIR_GROUP_SCHED */
517
518/* Real-Time classes' related field in a runqueue: */
519struct rt_rq {
520 struct rt_prio_array active;
521 unsigned long rt_nr_running;
522#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
523 struct {
524 int curr; /* highest queued rt task prio */
525#ifdef CONFIG_SMP
526 int next; /* next highest */
527#endif
528 } highest_prio;
529#endif
530#ifdef CONFIG_SMP
531 unsigned long rt_nr_migratory;
532 unsigned long rt_nr_total;
533 int overloaded;
534 struct plist_head pushable_tasks;
535#endif
536 int rt_throttled;
537 u64 rt_time;
538 u64 rt_runtime;
539 /* Nests inside the rq lock: */
540 raw_spinlock_t rt_runtime_lock;
541
542#ifdef CONFIG_RT_GROUP_SCHED
543 unsigned long rt_nr_boosted;
544
545 struct rq *rq;
546 struct list_head leaf_rt_rq_list;
547 struct task_group *tg;
548#endif
549};
550
551#ifdef CONFIG_SMP
552
553/*
554 * We add the notion of a root-domain which will be used to define per-domain
555 * variables. Each exclusive cpuset essentially defines an island domain by
556 * fully partitioning the member cpus from any other cpuset. Whenever a new
557 * exclusive cpuset is created, we also create and attach a new root-domain
558 * object.
559 *
560 */
561struct root_domain {
562 atomic_t refcount;
563 atomic_t rto_count;
564 struct rcu_head rcu;
565 cpumask_var_t span;
566 cpumask_var_t online;
567
568 /*
569 * The "RT overload" flag: it gets set if a CPU has more than
570 * one runnable RT task.
571 */
572 cpumask_var_t rto_mask;
573 struct cpupri cpupri;
574};
575
576/*
577 * By default the system creates a single root-domain with all cpus as
578 * members (mimicking the global state we have today).
579 */
580static struct root_domain def_root_domain;
581
582#endif /* CONFIG_SMP */
583
584/*
585 * This is the main, per-CPU runqueue data structure.
586 *
587 * Locking rule: those places that want to lock multiple runqueues
588 * (such as the load balancing or the thread migration code), lock
589 * acquire operations must be ordered by ascending &runqueue.
590 */
591struct rq {
592 /* runqueue lock: */
593 raw_spinlock_t lock;
594
595 /*
596 * nr_running and cpu_load should be in the same cacheline because
597 * remote CPUs use both these fields when doing load calculation.
598 */
599 unsigned long nr_running;
600 #define CPU_LOAD_IDX_MAX 5
601 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
602 unsigned long last_load_update_tick;
603#ifdef CONFIG_NO_HZ
604 u64 nohz_stamp;
605 unsigned char nohz_balance_kick;
606#endif
607 int skip_clock_update;
608
609 /* capture load from *all* tasks on this cpu: */
610 struct load_weight load;
611 unsigned long nr_load_updates;
612 u64 nr_switches;
613
614 struct cfs_rq cfs;
615 struct rt_rq rt;
616
617#ifdef CONFIG_FAIR_GROUP_SCHED
618 /* list of leaf cfs_rq on this cpu: */
619 struct list_head leaf_cfs_rq_list;
620#endif
621#ifdef CONFIG_RT_GROUP_SCHED
622 struct list_head leaf_rt_rq_list;
623#endif
624
625 /*
626 * This is part of a global counter where only the total sum
627 * over all CPUs matters. A task can increase this counter on
628 * one CPU and if it got migrated afterwards it may decrease
629 * it on another CPU. Always updated under the runqueue lock:
630 */
631 unsigned long nr_uninterruptible;
632
633 struct task_struct *curr, *idle, *stop;
634 unsigned long next_balance;
635 struct mm_struct *prev_mm;
636
637 u64 clock;
638 u64 clock_task;
639
640 atomic_t nr_iowait;
641
642#ifdef CONFIG_SMP
643 struct root_domain *rd;
644 struct sched_domain *sd;
645
646 unsigned long cpu_power;
647
648 unsigned char idle_balance;
649 /* For active balancing */
650 int post_schedule;
651 int active_balance;
652 int push_cpu;
653 struct cpu_stop_work active_balance_work;
654 /* cpu of this runqueue: */
655 int cpu;
656 int online;
657
658 u64 rt_avg;
659 u64 age_stamp;
660 u64 idle_stamp;
661 u64 avg_idle;
662#endif
663
664#ifdef CONFIG_IRQ_TIME_ACCOUNTING
665 u64 prev_irq_time;
666#endif
667#ifdef CONFIG_PARAVIRT
668 u64 prev_steal_time;
669#endif
670#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
671 u64 prev_steal_time_rq;
672#endif
673
674 /* calc_load related fields */
675 unsigned long calc_load_update;
676 long calc_load_active;
677
678#ifdef CONFIG_SCHED_HRTICK
679#ifdef CONFIG_SMP
680 int hrtick_csd_pending;
681 struct call_single_data hrtick_csd;
682#endif
683 struct hrtimer hrtick_timer;
684#endif
685
686#ifdef CONFIG_SCHEDSTATS
687 /* latency stats */
688 struct sched_info rq_sched_info;
689 unsigned long long rq_cpu_time;
690 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
691
692 /* sys_sched_yield() stats */
693 unsigned int yld_count;
694
695 /* schedule() stats */
696 unsigned int sched_switch;
697 unsigned int sched_count;
698 unsigned int sched_goidle;
699
700 /* try_to_wake_up() stats */
701 unsigned int ttwu_count;
702 unsigned int ttwu_local;
703#endif
704
705#ifdef CONFIG_SMP
706 struct llist_head wake_list;
707#endif
708};
709
710static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
711
712
713static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
714
715static inline int cpu_of(struct rq *rq)
716{
717#ifdef CONFIG_SMP
718 return rq->cpu;
719#else
720 return 0;
721#endif
722}
723
724#define rcu_dereference_check_sched_domain(p) \
725 rcu_dereference_check((p), \
726 lockdep_is_held(&sched_domains_mutex))
727
728/*
729 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
730 * See detach_destroy_domains: synchronize_sched for details.
731 *
732 * The domain tree of any CPU may only be accessed from within
733 * preempt-disabled sections.
734 */
735#define for_each_domain(cpu, __sd) \
736 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
737
738#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
739#define this_rq() (&__get_cpu_var(runqueues))
740#define task_rq(p) cpu_rq(task_cpu(p))
741#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
742#define raw_rq() (&__raw_get_cpu_var(runqueues))
743
744#ifdef CONFIG_CGROUP_SCHED
745
746/*
747 * Return the group to which this tasks belongs.
748 *
749 * We use task_subsys_state_check() and extend the RCU verification with
750 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
751 * task it moves into the cgroup. Therefore by holding either of those locks,
752 * we pin the task to the current cgroup.
753 */
754static inline struct task_group *task_group(struct task_struct *p)
755{
756 struct task_group *tg;
757 struct cgroup_subsys_state *css;
758
759 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
760 lockdep_is_held(&p->pi_lock) ||
761 lockdep_is_held(&task_rq(p)->lock));
762 tg = container_of(css, struct task_group, css);
763
764 return autogroup_task_group(p, tg);
765}
766
767/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
768static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
769{
770#ifdef CONFIG_FAIR_GROUP_SCHED
771 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
772 p->se.parent = task_group(p)->se[cpu];
773#endif
774
775#ifdef CONFIG_RT_GROUP_SCHED
776 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
777 p->rt.parent = task_group(p)->rt_se[cpu];
778#endif
779}
780
781#else /* CONFIG_CGROUP_SCHED */
782
783static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
784static inline struct task_group *task_group(struct task_struct *p)
785{
786 return NULL;
787}
788
789#endif /* CONFIG_CGROUP_SCHED */
790 109
791static void update_rq_clock_task(struct rq *rq, s64 delta); 110static void update_rq_clock_task(struct rq *rq, s64 delta);
792 111
793static void update_rq_clock(struct rq *rq) 112void update_rq_clock(struct rq *rq)
794{ 113{
795 s64 delta; 114 s64 delta;
796 115
@@ -803,44 +122,14 @@ static void update_rq_clock(struct rq *rq)
803} 122}
804 123
805/* 124/*
806 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
807 */
808#ifdef CONFIG_SCHED_DEBUG
809# define const_debug __read_mostly
810#else
811# define const_debug static const
812#endif
813
814/**
815 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
816 * @cpu: the processor in question.
817 *
818 * This interface allows printk to be called with the runqueue lock
819 * held and know whether or not it is OK to wake up the klogd.
820 */
821int runqueue_is_locked(int cpu)
822{
823 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
824}
825
826/*
827 * Debugging: various feature bits 125 * Debugging: various feature bits
828 */ 126 */
829 127
830#define SCHED_FEAT(name, enabled) \ 128#define SCHED_FEAT(name, enabled) \
831 __SCHED_FEAT_##name ,
832
833enum {
834#include "sched_features.h"
835};
836
837#undef SCHED_FEAT
838
839#define SCHED_FEAT(name, enabled) \
840 (1UL << __SCHED_FEAT_##name) * enabled | 129 (1UL << __SCHED_FEAT_##name) * enabled |
841 130
842const_debug unsigned int sysctl_sched_features = 131const_debug unsigned int sysctl_sched_features =
843#include "sched_features.h" 132#include "features.h"
844 0; 133 0;
845 134
846#undef SCHED_FEAT 135#undef SCHED_FEAT
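For reference, the two SCHED_FEAT() expansions above cooperate: the first include of the feature list builds the __SCHED_FEAT_* enum, the second multiplies each bit by its default and ORs the results into sysctl_sched_features. A minimal standalone sketch of the same preprocessor pattern, using a hypothetical FEATURE_LIST macro in place of the real features.h include and two feature names purely for illustration:

#include <stdio.h>

/* Stand-in for the features header that the kernel re-includes. */
#define FEATURE_LIST \
	SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) \
	SCHED_FEAT(HRTICK, 0)

/* First expansion: build an enum of feature bit indices. */
#define SCHED_FEAT(name, enabled) __SCHED_FEAT_##name,
enum { FEATURE_LIST __SCHED_FEAT_NR };
#undef SCHED_FEAT

/* Second expansion: OR the enabled features into a default bitmask. */
#define SCHED_FEAT(name, enabled) (1UL << __SCHED_FEAT_##name) * enabled |
static const unsigned long default_features = FEATURE_LIST 0;
#undef SCHED_FEAT

int main(void)
{
	/* Bit 0 (GENTLE_FAIR_SLEEPERS) is on, bit 1 (HRTICK) is off: prints 0x1, 2. */
	printf("mask = %#lx, NR = %d\n", default_features, __SCHED_FEAT_NR);
	return 0;
}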
@@ -850,7 +139,7 @@ const_debug unsigned int sysctl_sched_features =
850 #name , 139 #name ,
851 140
852static __read_mostly char *sched_feat_names[] = { 141static __read_mostly char *sched_feat_names[] = {
853#include "sched_features.h" 142#include "features.h"
854 NULL 143 NULL
855}; 144};
856 145
@@ -860,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
860{ 149{
861 int i; 150 int i;
862 151
863 for (i = 0; sched_feat_names[i]; i++) { 152 for (i = 0; i < __SCHED_FEAT_NR; i++) {
864 if (!(sysctl_sched_features & (1UL << i))) 153 if (!(sysctl_sched_features & (1UL << i)))
865 seq_puts(m, "NO_"); 154 seq_puts(m, "NO_");
866 seq_printf(m, "%s ", sched_feat_names[i]); 155 seq_printf(m, "%s ", sched_feat_names[i]);
@@ -870,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
870 return 0; 159 return 0;
871} 160}
872 161
162#ifdef HAVE_JUMP_LABEL
163
164#define jump_label_key__true jump_label_key_enabled
165#define jump_label_key__false jump_label_key_disabled
166
167#define SCHED_FEAT(name, enabled) \
168 jump_label_key__##enabled ,
169
170struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
171#include "features.h"
172};
173
174#undef SCHED_FEAT
175
176static void sched_feat_disable(int i)
177{
178 if (jump_label_enabled(&sched_feat_keys[i]))
179 jump_label_dec(&sched_feat_keys[i]);
180}
181
182static void sched_feat_enable(int i)
183{
184 if (!jump_label_enabled(&sched_feat_keys[i]))
185 jump_label_inc(&sched_feat_keys[i]);
186}
187#else
188static void sched_feat_disable(int i) { };
189static void sched_feat_enable(int i) { };
190#endif /* HAVE_JUMP_LABEL */
191
873static ssize_t 192static ssize_t
874sched_feat_write(struct file *filp, const char __user *ubuf, 193sched_feat_write(struct file *filp, const char __user *ubuf,
875 size_t cnt, loff_t *ppos) 194 size_t cnt, loff_t *ppos)
@@ -893,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
893 cmp += 3; 212 cmp += 3;
894 } 213 }
895 214
896 for (i = 0; sched_feat_names[i]; i++) { 215 for (i = 0; i < __SCHED_FEAT_NR; i++) {
897 if (strcmp(cmp, sched_feat_names[i]) == 0) { 216 if (strcmp(cmp, sched_feat_names[i]) == 0) {
898 if (neg) 217 if (neg) {
899 sysctl_sched_features &= ~(1UL << i); 218 sysctl_sched_features &= ~(1UL << i);
900 else 219 sched_feat_disable(i);
220 } else {
901 sysctl_sched_features |= (1UL << i); 221 sysctl_sched_features |= (1UL << i);
222 sched_feat_enable(i);
223 }
902 break; 224 break;
903 } 225 }
904 } 226 }
905 227
906 if (!sched_feat_names[i]) 228 if (i == __SCHED_FEAT_NR)
907 return -EINVAL; 229 return -EINVAL;
908 230
909 *ppos += cnt; 231 *ppos += cnt;
@@ -932,10 +254,7 @@ static __init int sched_init_debug(void)
932 return 0; 254 return 0;
933} 255}
934late_initcall(sched_init_debug); 256late_initcall(sched_init_debug);
935 257#endif /* CONFIG_SCHED_DEBUG */
936#endif
937
938#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
939 258
940/* 259/*
941 * Number of tasks to iterate in a single balance run. 260 * Number of tasks to iterate in a single balance run.
@@ -957,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
957 */ 276 */
958unsigned int sysctl_sched_rt_period = 1000000; 277unsigned int sysctl_sched_rt_period = 1000000;
959 278
960static __read_mostly int scheduler_running; 279__read_mostly int scheduler_running;
961 280
962/* 281/*
963 * part of the period that we allow rt tasks to run in us. 282 * part of the period that we allow rt tasks to run in us.
@@ -965,112 +284,7 @@ static __read_mostly int scheduler_running;
965 */ 284 */
966int sysctl_sched_rt_runtime = 950000; 285int sysctl_sched_rt_runtime = 950000;
967 286
968static inline u64 global_rt_period(void)
969{
970 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
971}
972
973static inline u64 global_rt_runtime(void)
974{
975 if (sysctl_sched_rt_runtime < 0)
976 return RUNTIME_INF;
977
978 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
979}
980
981#ifndef prepare_arch_switch
982# define prepare_arch_switch(next) do { } while (0)
983#endif
984#ifndef finish_arch_switch
985# define finish_arch_switch(prev) do { } while (0)
986#endif
987
988static inline int task_current(struct rq *rq, struct task_struct *p)
989{
990 return rq->curr == p;
991}
992
993static inline int task_running(struct rq *rq, struct task_struct *p)
994{
995#ifdef CONFIG_SMP
996 return p->on_cpu;
997#else
998 return task_current(rq, p);
999#endif
1000}
1001
1002#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1003static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1004{
1005#ifdef CONFIG_SMP
1006 /*
1007 * We can optimise this out completely for !SMP, because the
1008 * SMP rebalancing from interrupt is the only thing that cares
1009 * here.
1010 */
1011 next->on_cpu = 1;
1012#endif
1013}
1014
1015static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1016{
1017#ifdef CONFIG_SMP
1018 /*
1019 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1020 * We must ensure this doesn't happen until the switch is completely
1021 * finished.
1022 */
1023 smp_wmb();
1024 prev->on_cpu = 0;
1025#endif
1026#ifdef CONFIG_DEBUG_SPINLOCK
1027 /* this is a valid case when another task releases the spinlock */
1028 rq->lock.owner = current;
1029#endif
1030 /*
1031 * If we are tracking spinlock dependencies then we have to
1032 * fix up the runqueue lock - which gets 'carried over' from
1033 * prev into current:
1034 */
1035 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1036
1037 raw_spin_unlock_irq(&rq->lock);
1038}
1039 287
1040#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1041static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1042{
1043#ifdef CONFIG_SMP
1044 /*
1045 * We can optimise this out completely for !SMP, because the
1046 * SMP rebalancing from interrupt is the only thing that cares
1047 * here.
1048 */
1049 next->on_cpu = 1;
1050#endif
1051#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1052 raw_spin_unlock_irq(&rq->lock);
1053#else
1054 raw_spin_unlock(&rq->lock);
1055#endif
1056}
1057
1058static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1059{
1060#ifdef CONFIG_SMP
1061 /*
1062 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1063 * We must ensure this doesn't happen until the switch is completely
1064 * finished.
1065 */
1066 smp_wmb();
1067 prev->on_cpu = 0;
1068#endif
1069#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1070 local_irq_enable();
1071#endif
1072}
1073#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1074 288
1075/* 289/*
1076 * __task_rq_lock - lock the rq @p resides on. 290 * __task_rq_lock - lock the rq @p resides on.
@@ -1153,20 +367,6 @@ static struct rq *this_rq_lock(void)
1153 * rq->lock. 367 * rq->lock.
1154 */ 368 */
1155 369
1156/*
1157 * Use hrtick when:
1158 * - enabled by features
1159 * - hrtimer is actually high res
1160 */
1161static inline int hrtick_enabled(struct rq *rq)
1162{
1163 if (!sched_feat(HRTICK))
1164 return 0;
1165 if (!cpu_active(cpu_of(rq)))
1166 return 0;
1167 return hrtimer_is_hres_active(&rq->hrtick_timer);
1168}
1169
1170static void hrtick_clear(struct rq *rq) 370static void hrtick_clear(struct rq *rq)
1171{ 371{
1172 if (hrtimer_active(&rq->hrtick_timer)) 372 if (hrtimer_active(&rq->hrtick_timer))
@@ -1210,7 +410,7 @@ static void __hrtick_start(void *arg)
1210 * 410 *
1211 * called with rq->lock held and irqs disabled 411 * called with rq->lock held and irqs disabled
1212 */ 412 */
1213static void hrtick_start(struct rq *rq, u64 delay) 413void hrtick_start(struct rq *rq, u64 delay)
1214{ 414{
1215 struct hrtimer *timer = &rq->hrtick_timer; 415 struct hrtimer *timer = &rq->hrtick_timer;
1216 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 416 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1254,7 +454,7 @@ static __init void init_hrtick(void)
1254 * 454 *
1255 * called with rq->lock held and irqs disabled 455 * called with rq->lock held and irqs disabled
1256 */ 456 */
1257static void hrtick_start(struct rq *rq, u64 delay) 457void hrtick_start(struct rq *rq, u64 delay)
1258{ 458{
1259 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 459 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1260 HRTIMER_MODE_REL_PINNED, 0); 460 HRTIMER_MODE_REL_PINNED, 0);
@@ -1305,7 +505,7 @@ static inline void init_hrtick(void)
1305#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 505#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1306#endif 506#endif
1307 507
1308static void resched_task(struct task_struct *p) 508void resched_task(struct task_struct *p)
1309{ 509{
1310 int cpu; 510 int cpu;
1311 511
@@ -1326,7 +526,7 @@ static void resched_task(struct task_struct *p)
1326 smp_send_reschedule(cpu); 526 smp_send_reschedule(cpu);
1327} 527}
1328 528
1329static void resched_cpu(int cpu) 529void resched_cpu(int cpu)
1330{ 530{
1331 struct rq *rq = cpu_rq(cpu); 531 struct rq *rq = cpu_rq(cpu);
1332 unsigned long flags; 532 unsigned long flags;
@@ -1407,7 +607,8 @@ void wake_up_idle_cpu(int cpu)
1407 607
1408static inline bool got_nohz_idle_kick(void) 608static inline bool got_nohz_idle_kick(void)
1409{ 609{
1410 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; 610 int cpu = smp_processor_id();
611 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
1411} 612}
1412 613
1413#else /* CONFIG_NO_HZ */ 614#else /* CONFIG_NO_HZ */
@@ -1419,12 +620,7 @@ static inline bool got_nohz_idle_kick(void)
1419 620
1420#endif /* CONFIG_NO_HZ */ 621#endif /* CONFIG_NO_HZ */
1421 622
1422static u64 sched_avg_period(void) 623void sched_avg_update(struct rq *rq)
1423{
1424 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1425}
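For illustration: with the default sysctl_sched_time_avg of MSEC_PER_SEC (see the const_debug definition earlier in this file), sched_avg_period() works out to 1000 * NSEC_PER_MSEC / 2 = 500,000,000 ns, so the averaging used by sched_avg_update() runs on a half-second window.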
1426
1427static void sched_avg_update(struct rq *rq)
1428{ 624{
1429 s64 period = sched_avg_period(); 625 s64 period = sched_avg_period();
1430 626
@@ -1440,193 +636,23 @@ static void sched_avg_update(struct rq *rq)
1440 } 636 }
1441} 637}
1442 638
1443static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1444{
1445 rq->rt_avg += rt_delta;
1446 sched_avg_update(rq);
1447}
1448
1449#else /* !CONFIG_SMP */ 639#else /* !CONFIG_SMP */
1450static void resched_task(struct task_struct *p) 640void resched_task(struct task_struct *p)
1451{ 641{
1452 assert_raw_spin_locked(&task_rq(p)->lock); 642 assert_raw_spin_locked(&task_rq(p)->lock);
1453 set_tsk_need_resched(p); 643 set_tsk_need_resched(p);
1454} 644}
1455
1456static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1457{
1458}
1459
1460static void sched_avg_update(struct rq *rq)
1461{
1462}
1463#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
1464 646
1465#if BITS_PER_LONG == 32
1466# define WMULT_CONST (~0UL)
1467#else
1468# define WMULT_CONST (1UL << 32)
1469#endif
1470
1471#define WMULT_SHIFT 32
1472
1473/*
1474 * Shift right and round:
1475 */
1476#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1477
1478/*
1479 * delta *= weight / lw
1480 */
1481static unsigned long
1482calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1483 struct load_weight *lw)
1484{
1485 u64 tmp;
1486
1487 /*
1488 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1489 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1490 * 2^SCHED_LOAD_RESOLUTION.
1491 */
1492 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1493 tmp = (u64)delta_exec * scale_load_down(weight);
1494 else
1495 tmp = (u64)delta_exec;
1496
1497 if (!lw->inv_weight) {
1498 unsigned long w = scale_load_down(lw->weight);
1499
1500 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1501 lw->inv_weight = 1;
1502 else if (unlikely(!w))
1503 lw->inv_weight = WMULT_CONST;
1504 else
1505 lw->inv_weight = WMULT_CONST / w;
1506 }
1507
1508 /*
1509 * Check whether we'd overflow the 64-bit multiplication:
1510 */
1511 if (unlikely(tmp > WMULT_CONST))
1512 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1513 WMULT_SHIFT/2);
1514 else
1515 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1516
1517 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1518}
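calc_delta_mine() above avoids a 64-bit division on this hot path: it caches inv_weight, roughly 2^32 / weight, in the load_weight and replaces delta * weight / lw->weight with a multiply and a rounded right shift. A standalone sketch of that fixed-point step with small sample values; it skips the SCHED_LOAD_RESOLUTION scaling and the overflow guard shown above, and scale_delta()/srr() are illustrative names, not kernel functions:

#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT 32
#define WMULT_CONST (1ULL << WMULT_SHIFT)

/* Shift right and round, like the SRR() macro above. */
static uint64_t srr(uint64_t x, unsigned int y)
{
	return (x + (1ULL << (y - 1))) >> y;
}

/*
 * delta * weight / queue_weight, done as a multiply by the cached
 * 2^32 / queue_weight inverse followed by a rounded shift.
 */
static uint64_t scale_delta(uint64_t delta, uint64_t weight,
			    uint64_t queue_weight)
{
	uint64_t inv_weight = WMULT_CONST / queue_weight;	/* lw->inv_weight */

	return srr(delta * weight * inv_weight, WMULT_SHIFT);
}

int main(void)
{
	/*
	 * A nice-0 task (weight 1024) on a queue of total weight 3072
	 * gets one third of 6 ms of wall time: prints 2000000 (ns).
	 */
	printf("%llu\n",
	       (unsigned long long)scale_delta(6000000, 1024, 3072));
	return 0;
}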
1519
1520static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1521{
1522 lw->weight += inc;
1523 lw->inv_weight = 0;
1524}
1525
1526static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1527{
1528 lw->weight -= dec;
1529 lw->inv_weight = 0;
1530}
1531
1532static inline void update_load_set(struct load_weight *lw, unsigned long w)
1533{
1534 lw->weight = w;
1535 lw->inv_weight = 0;
1536}
1537
1538/*
1539 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1540 * of tasks with abnormal "nice" values across CPUs, the contribution that
1541 * each task makes to its run queue's load is weighted according to its
1542 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1543 * scaled version of the new time slice allocation that they receive on time
1544 * slice expiry etc.
1545 */
1546
1547#define WEIGHT_IDLEPRIO 3
1548#define WMULT_IDLEPRIO 1431655765
1549
1550/*
1551 * Nice levels are multiplicative, with a gentle 10% change for every
1552 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1553 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1554 * that remained on nice 0.
1555 *
1556 * The "10% effect" is relative and cumulative: from _any_ nice level,
1557 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1558 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1559 * If a task goes up by ~10% and another task goes down by ~10% then
1560 * the relative distance between them is ~25%.)
1561 */
1562static const int prio_to_weight[40] = {
1563 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1564 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1565 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1566 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1567 /* 0 */ 1024, 820, 655, 526, 423,
1568 /* 5 */ 335, 272, 215, 172, 137,
1569 /* 10 */ 110, 87, 70, 56, 45,
1570 /* 15 */ 36, 29, 23, 18, 15,
1571};
1572
1573/*
1574 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1575 *
1576 * In cases where the weight does not change often, we can use the
1577 * precalculated inverse to speed up arithmetics by turning divisions
1578 * into multiplications:
1579 */
1580static const u32 prio_to_wmult[40] = {
1581 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1582 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1583 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1584 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1585 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1586 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1587 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1588 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1589};
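For illustration, the two tables encode the "~10% per nice level" rule as a geometric series with ratio roughly 1.25 around the nice-0 weight of 1024, and prio_to_wmult[] is simply 2^32 divided by the corresponding weight. A small standalone check of both relationships, with the values copied from the tables above:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Entries of prio_to_weight[], indexed by nice + 20. */
	const unsigned int w_nice_m1 = 1277;	/* nice -1 */
	const unsigned int w_nice_0  = 1024;	/* nice  0 */
	const unsigned int w_nice_1  = 820;	/* nice +1 */

	/* Each step down in nice costs ~25% weight; prints ~1.25 twice. */
	printf("ratio -1/0: %.3f, 0/+1: %.3f\n",
	       (double)w_nice_m1 / w_nice_0, (double)w_nice_0 / w_nice_1);

	/* Two runnable tasks, nice 0 vs nice +1: ~55% vs ~45% of the CPU. */
	printf("nice 0 share: %.1f%%\n",
	       100.0 * w_nice_0 / (w_nice_0 + w_nice_1));

	/* prio_to_wmult[] is 2^32 / weight, e.g. 2^32 / 1024 = 4194304. */
	printf("inverse of 1024: %llu\n",
	       (unsigned long long)((1ULL << 32) / w_nice_0));
	return 0;
}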
1590
1591/* Time spent by the tasks of the cpu accounting group executing in ... */
1592enum cpuacct_stat_index {
1593 CPUACCT_STAT_USER, /* ... user mode */
1594 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1595
1596 CPUACCT_STAT_NSTATS,
1597};
1598
1599#ifdef CONFIG_CGROUP_CPUACCT
1600static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1601static void cpuacct_update_stats(struct task_struct *tsk,
1602 enum cpuacct_stat_index idx, cputime_t val);
1603#else
1604static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1605static inline void cpuacct_update_stats(struct task_struct *tsk,
1606 enum cpuacct_stat_index idx, cputime_t val) {}
1607#endif
1608
1609static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1610{
1611 update_load_add(&rq->load, load);
1612}
1613
1614static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1615{
1616 update_load_sub(&rq->load, load);
1617}
1618
1619#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 647#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1620 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 648 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1621typedef int (*tg_visitor)(struct task_group *, void *);
1622
1623/* 649/*
1624 * Iterate task_group tree rooted at *from, calling @down when first entering a 650 * Iterate task_group tree rooted at *from, calling @down when first entering a
1625 * node and @up when leaving it for the final time. 651 * node and @up when leaving it for the final time.
1626 * 652 *
1627 * Caller must hold rcu_lock or sufficient equivalent. 653 * Caller must hold rcu_lock or sufficient equivalent.
1628 */ 654 */
1629static int walk_tg_tree_from(struct task_group *from, 655int walk_tg_tree_from(struct task_group *from,
1630 tg_visitor down, tg_visitor up, void *data) 656 tg_visitor down, tg_visitor up, void *data)
1631{ 657{
1632 struct task_group *parent, *child; 658 struct task_group *parent, *child;
@@ -1657,270 +683,13 @@ out:
1657 return ret; 683 return ret;
1658} 684}
1659 685
1660/* 686int tg_nop(struct task_group *tg, void *data)
1661 * Iterate the full tree, calling @down when first entering a node and @up when
1662 * leaving it for the final time.
1663 *
1664 * Caller must hold rcu_lock or sufficient equivalent.
1665 */
1666
1667static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1668{
1669 return walk_tg_tree_from(&root_task_group, down, up, data);
1670}
1671
1672static int tg_nop(struct task_group *tg, void *data)
1673{
1674 return 0;
1675}
1676#endif
1677
1678#ifdef CONFIG_SMP
1679/* Used instead of source_load when we know the type == 0 */
1680static unsigned long weighted_cpuload(const int cpu)
1681{
1682 return cpu_rq(cpu)->load.weight;
1683}
1684
1685/*
1686 * Return a low guess at the load of a migration-source cpu weighted
1687 * according to the scheduling class and "nice" value.
1688 *
1689 * We want to under-estimate the load of migration sources, to
1690 * balance conservatively.
1691 */
1692static unsigned long source_load(int cpu, int type)
1693{
1694 struct rq *rq = cpu_rq(cpu);
1695 unsigned long total = weighted_cpuload(cpu);
1696
1697 if (type == 0 || !sched_feat(LB_BIAS))
1698 return total;
1699
1700 return min(rq->cpu_load[type-1], total);
1701}
1702
1703/*
1704 * Return a high guess at the load of a migration-target cpu weighted
1705 * according to the scheduling class and "nice" value.
1706 */
1707static unsigned long target_load(int cpu, int type)
1708{
1709 struct rq *rq = cpu_rq(cpu);
1710 unsigned long total = weighted_cpuload(cpu);
1711
1712 if (type == 0 || !sched_feat(LB_BIAS))
1713 return total;
1714
1715 return max(rq->cpu_load[type-1], total);
1716}
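For illustration, source_load() and target_load() are deliberately asymmetric: both start from the instantaneous weighted_cpuload(), but the source side takes the min against the decayed cpu_load[] history while the target side takes the max. With LB_BIAS enabled, a cpu whose cpu_load[type-1] is 900 but whose current rq weight is 1100 therefore reports 900 when considered as a migration source and 1100 as a migration target, which biases the balancer toward conservative moves in both directions.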
1717
1718static unsigned long power_of(int cpu)
1719{
1720 return cpu_rq(cpu)->cpu_power;
1721}
1722
1723static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1724
1725static unsigned long cpu_avg_load_per_task(int cpu)
1726{ 687{
1727 struct rq *rq = cpu_rq(cpu);
1728 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1729
1730 if (nr_running)
1731 return rq->load.weight / nr_running;
1732
1733 return 0; 688 return 0;
1734} 689}
1735
1736#ifdef CONFIG_PREEMPT
1737
1738static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1739
1740/*
1741 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1742 * way at the expense of forcing extra atomic operations in all
1743 * invocations. This assures that the double_lock is acquired using the
1744 * same underlying policy as the spinlock_t on this architecture, which
1745 * reduces latency compared to the unfair variant below. However, it
1746 * also adds more overhead and therefore may reduce throughput.
1747 */
1748static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1749 __releases(this_rq->lock)
1750 __acquires(busiest->lock)
1751 __acquires(this_rq->lock)
1752{
1753 raw_spin_unlock(&this_rq->lock);
1754 double_rq_lock(this_rq, busiest);
1755
1756 return 1;
1757}
1758
1759#else
1760/*
1761 * Unfair double_lock_balance: Optimizes throughput at the expense of
1762 * latency by eliminating extra atomic operations when the locks are
1763 * already in proper order on entry. This favors lower cpu-ids and will
1764 * grant the double lock to lower cpus over higher ids under contention,
1765 * regardless of entry order into the function.
1766 */
1767static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1768 __releases(this_rq->lock)
1769 __acquires(busiest->lock)
1770 __acquires(this_rq->lock)
1771{
1772 int ret = 0;
1773
1774 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1775 if (busiest < this_rq) {
1776 raw_spin_unlock(&this_rq->lock);
1777 raw_spin_lock(&busiest->lock);
1778 raw_spin_lock_nested(&this_rq->lock,
1779 SINGLE_DEPTH_NESTING);
1780 ret = 1;
1781 } else
1782 raw_spin_lock_nested(&busiest->lock,
1783 SINGLE_DEPTH_NESTING);
1784 }
1785 return ret;
1786}
1787
1788#endif /* CONFIG_PREEMPT */
1789
1790/*
1791 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1792 */
1793static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1794{
1795 if (unlikely(!irqs_disabled())) {
1796 /* printk() doesn't work well under rq->lock */
1797 raw_spin_unlock(&this_rq->lock);
1798 BUG_ON(1);
1799 }
1800
1801 return _double_lock_balance(this_rq, busiest);
1802}
1803
1804static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 __releases(busiest->lock)
1806{
1807 raw_spin_unlock(&busiest->lock);
1808 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1809}
1810
1811/*
1812 * double_rq_lock - safely lock two runqueues
1813 *
1814 * Note this does not disable interrupts like task_rq_lock,
1815 * you need to do so manually before calling.
1816 */
1817static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1818 __acquires(rq1->lock)
1819 __acquires(rq2->lock)
1820{
1821 BUG_ON(!irqs_disabled());
1822 if (rq1 == rq2) {
1823 raw_spin_lock(&rq1->lock);
1824 __acquire(rq2->lock); /* Fake it out ;) */
1825 } else {
1826 if (rq1 < rq2) {
1827 raw_spin_lock(&rq1->lock);
1828 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1829 } else {
1830 raw_spin_lock(&rq2->lock);
1831 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1832 }
1833 }
1834}
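double_rq_lock() above avoids ABBA deadlock by always acquiring the lower-addressed lock first, whatever order the arguments arrive in; raw_spin_lock_nested() only tells lockdep that the second acquisition is intentional. A userspace sketch of the same address-ordering idiom with pthreads; lock_pair() and unlock_pair() are made-up helper names, not kernel APIs:

#include <pthread.h>

/*
 * Take two mutexes in a globally consistent (address) order so that
 * concurrent callers passing them in opposite order cannot deadlock.
 */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);		/* same lock: take it once */
		return;
	}
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&m1, &m2);	/* lock_pair(&m2, &m1) takes the same order */
	unlock_pair(&m1, &m2);
	return 0;
}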
1835
1836/*
1837 * double_rq_unlock - safely unlock two runqueues
1838 *
1839 * Note this does not restore interrupts like task_rq_unlock,
1840 * you need to do so manually after calling.
1841 */
1842static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1843 __releases(rq1->lock)
1844 __releases(rq2->lock)
1845{
1846 raw_spin_unlock(&rq1->lock);
1847 if (rq1 != rq2)
1848 raw_spin_unlock(&rq2->lock);
1849 else
1850 __release(rq2->lock);
1851}
1852
1853#else /* CONFIG_SMP */
1854
1855/*
1856 * double_rq_lock - safely lock two runqueues
1857 *
1858 * Note this does not disable interrupts like task_rq_lock,
1859 * you need to do so manually before calling.
1860 */
1861static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1862 __acquires(rq1->lock)
1863 __acquires(rq2->lock)
1864{
1865 BUG_ON(!irqs_disabled());
1866 BUG_ON(rq1 != rq2);
1867 raw_spin_lock(&rq1->lock);
1868 __acquire(rq2->lock); /* Fake it out ;) */
1869}
1870
1871/*
1872 * double_rq_unlock - safely unlock two runqueues
1873 *
1874 * Note this does not restore interrupts like task_rq_unlock,
1875 * you need to do so manually after calling.
1876 */
1877static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1878 __releases(rq1->lock)
1879 __releases(rq2->lock)
1880{
1881 BUG_ON(rq1 != rq2);
1882 raw_spin_unlock(&rq1->lock);
1883 __release(rq2->lock);
1884}
1885
1886#endif 690#endif
1887 691
1888static void calc_load_account_idle(struct rq *this_rq); 692void update_cpu_load(struct rq *this_rq);
1889static void update_sysctl(void);
1890static int get_update_sysctl_factor(void);
1891static void update_cpu_load(struct rq *this_rq);
1892
1893static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1894{
1895 set_task_rq(p, cpu);
1896#ifdef CONFIG_SMP
1897 /*
1898 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1899 * successfully executed on another CPU. We must ensure that updates of
1900 * per-task data have been completed by this moment.
1901 */
1902 smp_wmb();
1903 task_thread_info(p)->cpu = cpu;
1904#endif
1905}
1906
1907static const struct sched_class rt_sched_class;
1908
1909#define sched_class_highest (&stop_sched_class)
1910#define for_each_class(class) \
1911 for (class = sched_class_highest; class; class = class->next)
1912
1913#include "sched_stats.h"
1914
1915static void inc_nr_running(struct rq *rq)
1916{
1917 rq->nr_running++;
1918}
1919
1920static void dec_nr_running(struct rq *rq)
1921{
1922 rq->nr_running--;
1923}
1924 693
1925static void set_load_weight(struct task_struct *p) 694static void set_load_weight(struct task_struct *p)
1926{ 695{
@@ -1957,7 +726,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1957/* 726/*
1958 * activate_task - move a task to the runqueue. 727 * activate_task - move a task to the runqueue.
1959 */ 728 */
1960static void activate_task(struct rq *rq, struct task_struct *p, int flags) 729void activate_task(struct rq *rq, struct task_struct *p, int flags)
1961{ 730{
1962 if (task_contributes_to_load(p)) 731 if (task_contributes_to_load(p))
1963 rq->nr_uninterruptible--; 732 rq->nr_uninterruptible--;
@@ -1968,7 +737,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1968/* 737/*
1969 * deactivate_task - remove a task from the runqueue. 738 * deactivate_task - remove a task from the runqueue.
1970 */ 739 */
1971static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 740void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1972{ 741{
1973 if (task_contributes_to_load(p)) 742 if (task_contributes_to_load(p))
1974 rq->nr_uninterruptible++; 743 rq->nr_uninterruptible++;
@@ -2159,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2159#ifdef CONFIG_IRQ_TIME_ACCOUNTING 928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2160static int irqtime_account_hi_update(void) 929static int irqtime_account_hi_update(void)
2161{ 930{
2162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 931 u64 *cpustat = kcpustat_this_cpu->cpustat;
2163 unsigned long flags; 932 unsigned long flags;
2164 u64 latest_ns; 933 u64 latest_ns;
2165 int ret = 0; 934 int ret = 0;
2166 935
2167 local_irq_save(flags); 936 local_irq_save(flags);
2168 latest_ns = this_cpu_read(cpu_hardirq_time); 937 latest_ns = this_cpu_read(cpu_hardirq_time);
2169 if (nsecs_to_cputime64(latest_ns) > cpustat->irq) 938 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
2170 ret = 1; 939 ret = 1;
2171 local_irq_restore(flags); 940 local_irq_restore(flags);
2172 return ret; 941 return ret;
@@ -2174,14 +943,14 @@ static int irqtime_account_hi_update(void)
2174 943
2175static int irqtime_account_si_update(void) 944static int irqtime_account_si_update(void)
2176{ 945{
2177 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 946 u64 *cpustat = kcpustat_this_cpu->cpustat;
2178 unsigned long flags; 947 unsigned long flags;
2179 u64 latest_ns; 948 u64 latest_ns;
2180 int ret = 0; 949 int ret = 0;
2181 950
2182 local_irq_save(flags); 951 local_irq_save(flags);
2183 latest_ns = this_cpu_read(cpu_softirq_time); 952 latest_ns = this_cpu_read(cpu_softirq_time);
2184 if (nsecs_to_cputime64(latest_ns) > cpustat->softirq) 953 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
2185 ret = 1; 954 ret = 1;
2186 local_irq_restore(flags); 955 local_irq_restore(flags);
2187 return ret; 956 return ret;
@@ -2193,15 +962,6 @@ static int irqtime_account_si_update(void)
2193 962
2194#endif 963#endif
2195 964
2196#include "sched_idletask.c"
2197#include "sched_fair.c"
2198#include "sched_rt.c"
2199#include "sched_autogroup.c"
2200#include "sched_stoptask.c"
2201#ifdef CONFIG_SCHED_DEBUG
2202# include "sched_debug.c"
2203#endif
2204
2205void sched_set_stop_task(int cpu, struct task_struct *stop) 965void sched_set_stop_task(int cpu, struct task_struct *stop)
2206{ 966{
2207 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 967 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2299,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2299 p->sched_class->prio_changed(rq, p, oldprio); 1059 p->sched_class->prio_changed(rq, p, oldprio);
2300} 1060}
2301 1061
2302static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1062void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2303{ 1063{
2304 const struct sched_class *class; 1064 const struct sched_class *class;
2305 1065
@@ -2325,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2325} 1085}
2326 1086
2327#ifdef CONFIG_SMP 1087#ifdef CONFIG_SMP
2328/*
2329 * Is this task likely cache-hot:
2330 */
2331static int
2332task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2333{
2334 s64 delta;
2335
2336 if (p->sched_class != &fair_sched_class)
2337 return 0;
2338
2339 if (unlikely(p->policy == SCHED_IDLE))
2340 return 0;
2341
2342 /*
2343 * Buddy candidates are cache hot:
2344 */
2345 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2346 (&p->se == cfs_rq_of(&p->se)->next ||
2347 &p->se == cfs_rq_of(&p->se)->last))
2348 return 1;
2349
2350 if (sysctl_sched_migration_cost == -1)
2351 return 1;
2352 if (sysctl_sched_migration_cost == 0)
2353 return 0;
2354
2355 delta = now - p->se.exec_start;
2356
2357 return delta < (s64)sysctl_sched_migration_cost;
2358}
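For illustration, task_hot() treats a fair-class task as cache hot when it last ran within sysctl_sched_migration_cost of "now"; buddy candidates are always hot, and the sysctl's -1/0 values force always-hot/never-hot. Assuming the usual default of 0.5 ms for sysctl_sched_migration_cost (not visible in this hunk), a task whose se.exec_start is 200 µs old would still be considered hot and left in place, while one preempted 2 ms ago would be fair game for migration.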
2359
2360void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1088void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2361{ 1089{
2362#ifdef CONFIG_SCHED_DEBUG 1090#ifdef CONFIG_SCHED_DEBUG
@@ -3439,7 +2167,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3439 */ 2167 */
3440static atomic_long_t calc_load_tasks_idle; 2168static atomic_long_t calc_load_tasks_idle;
3441 2169
3442static void calc_load_account_idle(struct rq *this_rq) 2170void calc_load_account_idle(struct rq *this_rq)
3443{ 2171{
3444 long delta; 2172 long delta;
3445 2173
@@ -3583,7 +2311,7 @@ static void calc_global_nohz(unsigned long ticks)
3583 */ 2311 */
3584} 2312}
3585#else 2313#else
3586static void calc_load_account_idle(struct rq *this_rq) 2314void calc_load_account_idle(struct rq *this_rq)
3587{ 2315{
3588} 2316}
3589 2317
@@ -3726,7 +2454,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3726 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2454 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3727 * every tick. We fix it up based on jiffies. 2455 * every tick. We fix it up based on jiffies.
3728 */ 2456 */
3729static void update_cpu_load(struct rq *this_rq) 2457void update_cpu_load(struct rq *this_rq)
3730{ 2458{
3731 unsigned long this_load = this_rq->load.weight; 2459 unsigned long this_load = this_rq->load.weight;
3732 unsigned long curr_jiffies = jiffies; 2460 unsigned long curr_jiffies = jiffies;
@@ -3804,8 +2532,10 @@ unlock:
3804#endif 2532#endif
3805 2533
3806DEFINE_PER_CPU(struct kernel_stat, kstat); 2534DEFINE_PER_CPU(struct kernel_stat, kstat);
2535DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3807 2536
3808EXPORT_PER_CPU_SYMBOL(kstat); 2537EXPORT_PER_CPU_SYMBOL(kstat);
2538EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3809 2539
3810/* 2540/*
3811 * Return any ns on the sched_clock that have not yet been accounted in 2541 * Return any ns on the sched_clock that have not yet been accounted in
@@ -3858,6 +2588,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3858 return ns; 2588 return ns;
3859} 2589}
3860 2590
2591#ifdef CONFIG_CGROUP_CPUACCT
2592struct cgroup_subsys cpuacct_subsys;
2593struct cpuacct root_cpuacct;
2594#endif
2595
2596static inline void task_group_account_field(struct task_struct *p, int index,
2597 u64 tmp)
2598{
2599#ifdef CONFIG_CGROUP_CPUACCT
2600 struct kernel_cpustat *kcpustat;
2601 struct cpuacct *ca;
2602#endif
2603 /*
2604 * Since every update is guaranteed to touch the root cgroup, account
2605 * to it first. If the root cgroup is the only cgroup, then nothing
2606 * else needs to be done.
2607 *
2608 */
2609 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2610
2611#ifdef CONFIG_CGROUP_CPUACCT
2612 if (unlikely(!cpuacct_subsys.active))
2613 return;
2614
2615 rcu_read_lock();
2616 ca = task_ca(p);
2617 while (ca && (ca != &root_cpuacct)) {
2618 kcpustat = this_cpu_ptr(ca->cpustat);
2619 kcpustat->cpustat[index] += tmp;
2620 ca = parent_ca(ca);
2621 }
2622 rcu_read_unlock();
2623#endif
2624}
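task_group_account_field() above always charges the root per-cpu counters, then, only if cpuacct is active, walks the task's cpuacct ancestors up to but not including root_cpuacct, charging each level. A toy userspace sketch of that charge-root-then-walk-ancestors pattern; struct group and account_field() are illustrative stand-ins, not the kernel's types:

#include <stdio.h>
#include <stdint.h>

enum { CPUTIME_USER, CPUTIME_NICE, NR_STATS };

struct group {
	uint64_t cpustat[NR_STATS];
	struct group *parent;		/* NULL or &root for the top level */
};

static struct group root;

/* Charge @tmp to the root and to every ancestor of @grp below the root. */
static void account_field(struct group *grp, int index, uint64_t tmp)
{
	root.cpustat[index] += tmp;

	for (; grp && grp != &root; grp = grp->parent)
		grp->cpustat[index] += tmp;
}

int main(void)
{
	struct group parent = { .parent = &root };
	struct group child  = { .parent = &parent };

	account_field(&child, CPUTIME_USER, 10);	/* one 10-unit tick */

	/* Every level sees the charge: prints "root 10, parent 10, child 10". */
	printf("root %llu, parent %llu, child %llu\n",
	       (unsigned long long)root.cpustat[CPUTIME_USER],
	       (unsigned long long)parent.cpustat[CPUTIME_USER],
	       (unsigned long long)child.cpustat[CPUTIME_USER]);
	return 0;
}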
2625
2626
3861/* 2627/*
3862 * Account user cpu time to a process. 2628 * Account user cpu time to a process.
3863 * @p: the process that the cpu time gets accounted to 2629 * @p: the process that the cpu time gets accounted to
@@ -3867,20 +2633,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3867void account_user_time(struct task_struct *p, cputime_t cputime, 2633void account_user_time(struct task_struct *p, cputime_t cputime,
3868 cputime_t cputime_scaled) 2634 cputime_t cputime_scaled)
3869{ 2635{
3870 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2636 int index;
3871 2637
3872 /* Add user time to process. */ 2638 /* Add user time to process. */
3873 p->utime += cputime; 2639 p->utime += cputime;
3874 p->utimescaled += cputime_scaled; 2640 p->utimescaled += cputime_scaled;
3875 account_group_user_time(p, cputime); 2641 account_group_user_time(p, cputime);
3876 2642
2643 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2644
3877 /* Add user time to cpustat. */ 2645 /* Add user time to cpustat. */
3878 if (TASK_NICE(p) > 0) 2646 task_group_account_field(p, index, (__force u64) cputime);
3879 cpustat->nice += (__force cputime64_t) cputime;
3880 else
3881 cpustat->user += (__force cputime64_t) cputime;
3882 2647
3883 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3884 /* Account for user time used */ 2648 /* Account for user time used */
3885 acct_update_integrals(p); 2649 acct_update_integrals(p);
3886} 2650}
@@ -3894,7 +2658,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
3894static void account_guest_time(struct task_struct *p, cputime_t cputime, 2658static void account_guest_time(struct task_struct *p, cputime_t cputime,
3895 cputime_t cputime_scaled) 2659 cputime_t cputime_scaled)
3896{ 2660{
3897 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2661 u64 *cpustat = kcpustat_this_cpu->cpustat;
3898 2662
3899 /* Add guest time to process. */ 2663 /* Add guest time to process. */
3900 p->utime += cputime; 2664 p->utime += cputime;
@@ -3904,11 +2668,11 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3904 2668
3905 /* Add guest time to cpustat. */ 2669 /* Add guest time to cpustat. */
3906 if (TASK_NICE(p) > 0) { 2670 if (TASK_NICE(p) > 0) {
3907 cpustat->nice += (__force cputime64_t) cputime; 2671 cpustat[CPUTIME_NICE] += (__force u64) cputime;
3908 cpustat->guest_nice += (__force cputime64_t) cputime; 2672 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
3909 } else { 2673 } else {
3910 cpustat->user += (__force cputime64_t) cputime; 2674 cpustat[CPUTIME_USER] += (__force u64) cputime;
3911 cpustat->guest += (__force cputime64_t) cputime; 2675 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
3912 } 2676 }
3913} 2677}
3914 2678
@@ -3921,7 +2685,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3921 */ 2685 */
3922static inline 2686static inline
3923void __account_system_time(struct task_struct *p, cputime_t cputime, 2687void __account_system_time(struct task_struct *p, cputime_t cputime,
3924 cputime_t cputime_scaled, cputime64_t *target_cputime64) 2688 cputime_t cputime_scaled, int index)
3925{ 2689{
3926 /* Add system time to process. */ 2690 /* Add system time to process. */
3927 p->stime += cputime; 2691 p->stime += cputime;
@@ -3929,8 +2693,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
3929 account_group_system_time(p, cputime); 2693 account_group_system_time(p, cputime);
3930 2694
3931 /* Add system time to cpustat. */ 2695 /* Add system time to cpustat. */
3932 *target_cputime64 += (__force cputime64_t) cputime; 2696 task_group_account_field(p, index, (__force u64) cputime);
3933 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3934 2697
3935 /* Account for system time used */ 2698 /* Account for system time used */
3936 acct_update_integrals(p); 2699 acct_update_integrals(p);
@@ -3946,8 +2709,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
3946void account_system_time(struct task_struct *p, int hardirq_offset, 2709void account_system_time(struct task_struct *p, int hardirq_offset,
3947 cputime_t cputime, cputime_t cputime_scaled) 2710 cputime_t cputime, cputime_t cputime_scaled)
3948{ 2711{
3949 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2712 int index;
3950 cputime64_t *target_cputime64;
3951 2713
3952 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2714 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3953 account_guest_time(p, cputime, cputime_scaled); 2715 account_guest_time(p, cputime, cputime_scaled);
@@ -3955,13 +2717,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3955 } 2717 }
3956 2718
3957 if (hardirq_count() - hardirq_offset) 2719 if (hardirq_count() - hardirq_offset)
3958 target_cputime64 = &cpustat->irq; 2720 index = CPUTIME_IRQ;
3959 else if (in_serving_softirq()) 2721 else if (in_serving_softirq())
3960 target_cputime64 = &cpustat->softirq; 2722 index = CPUTIME_SOFTIRQ;
3961 else 2723 else
3962 target_cputime64 = &cpustat->system; 2724 index = CPUTIME_SYSTEM;
3963 2725
3964 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 2726 __account_system_time(p, cputime, cputime_scaled, index);
3965} 2727}
3966 2728
3967/* 2729/*
@@ -3970,9 +2732,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3970 */ 2732 */
3971void account_steal_time(cputime_t cputime) 2733void account_steal_time(cputime_t cputime)
3972{ 2734{
3973 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2735 u64 *cpustat = kcpustat_this_cpu->cpustat;
3974 2736
3975 cpustat->steal += (__force cputime64_t) cputime; 2737 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
3976} 2738}
3977 2739
3978/* 2740/*
@@ -3981,13 +2743,13 @@ void account_steal_time(cputime_t cputime)
3981 */ 2743 */
3982void account_idle_time(cputime_t cputime) 2744void account_idle_time(cputime_t cputime)
3983{ 2745{
3984 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2746 u64 *cpustat = kcpustat_this_cpu->cpustat;
3985 struct rq *rq = this_rq(); 2747 struct rq *rq = this_rq();
3986 2748
3987 if (atomic_read(&rq->nr_iowait) > 0) 2749 if (atomic_read(&rq->nr_iowait) > 0)
3988 cpustat->iowait += (__force cputime64_t) cputime; 2750 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
3989 else 2751 else
3990 cpustat->idle += (__force cputime64_t) cputime; 2752 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
3991} 2753}
3992 2754
3993static __always_inline bool steal_account_process_tick(void) 2755static __always_inline bool steal_account_process_tick(void)
@@ -4037,15 +2799,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4037 struct rq *rq) 2799 struct rq *rq)
4038{ 2800{
4039 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2801 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
4040 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2802 u64 *cpustat = kcpustat_this_cpu->cpustat;
4041 2803
4042 if (steal_account_process_tick()) 2804 if (steal_account_process_tick())
4043 return; 2805 return;
4044 2806
4045 if (irqtime_account_hi_update()) { 2807 if (irqtime_account_hi_update()) {
4046 cpustat->irq += (__force cputime64_t) cputime_one_jiffy; 2808 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
4047 } else if (irqtime_account_si_update()) { 2809 } else if (irqtime_account_si_update()) {
4048 cpustat->softirq += (__force cputime64_t) cputime_one_jiffy; 2810 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
4049 } else if (this_cpu_ksoftirqd() == p) { 2811 } else if (this_cpu_ksoftirqd() == p) {
4050 /* 2812 /*
4051 * ksoftirqd time do not get accounted in cpu_softirq_time. 2813 * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -4053,7 +2815,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4053 * Also, p->stime needs to be updated for ksoftirqd. 2815 * Also, p->stime needs to be updated for ksoftirqd.
4054 */ 2816 */
4055 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2817 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4056 &cpustat->softirq); 2818 CPUTIME_SOFTIRQ);
4057 } else if (user_tick) { 2819 } else if (user_tick) {
4058 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2820 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4059 } else if (p == rq->idle) { 2821 } else if (p == rq->idle) {
@@ -4062,7 +2824,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4062 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2824 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
4063 } else { 2825 } else {
4064 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2826 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4065 &cpustat->system); 2827 CPUTIME_SYSTEM);
4066 } 2828 }
4067} 2829}
4068 2830
@@ -5841,6 +4603,13 @@ again:
5841 */ 4603 */
5842 if (preempt && rq != p_rq) 4604 if (preempt && rq != p_rq)
5843 resched_task(p_rq->curr); 4605 resched_task(p_rq->curr);
4606 } else {
4607 /*
4608 * We might have set it in task_yield_fair(), but are
4609 * not going to schedule(), so don't want to skip
4610 * the next update.
4611 */
4612 rq->skip_clock_update = 0;
5844 } 4613 }
5845 4614
5846out: 4615out:
@@ -6008,7 +4777,7 @@ void sched_show_task(struct task_struct *p)
6008 free = stack_not_used(p); 4777 free = stack_not_used(p);
6009#endif 4778#endif
6010 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4779 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6011 task_pid_nr(p), task_pid_nr(p->real_parent), 4780 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
6012 (unsigned long)task_thread_info(p)->flags); 4781 (unsigned long)task_thread_info(p)->flags);
6013 4782
6014 show_stack(p, NULL); 4783 show_stack(p, NULL);
@@ -6107,53 +4876,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6107#endif 4876#endif
6108} 4877}
6109 4878
6110/*
6111 * Increase the granularity value when there are more CPUs,
6112 * because with more CPUs the 'effective latency' as visible
6113 * to users decreases. But the relationship is not linear,
6114 * so pick a second-best guess by going with the log2 of the
6115 * number of CPUs.
6116 *
6117 * This idea comes from the SD scheduler of Con Kolivas:
6118 */
6119static int get_update_sysctl_factor(void)
6120{
6121 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6122 unsigned int factor;
6123
6124 switch (sysctl_sched_tunable_scaling) {
6125 case SCHED_TUNABLESCALING_NONE:
6126 factor = 1;
6127 break;
6128 case SCHED_TUNABLESCALING_LINEAR:
6129 factor = cpus;
6130 break;
6131 case SCHED_TUNABLESCALING_LOG:
6132 default:
6133 factor = 1 + ilog2(cpus);
6134 break;
6135 }
6136
6137 return factor;
6138}
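For illustration, get_update_sysctl_factor() clamps the CPU count at 8, so in the default SCHED_TUNABLESCALING_LOG mode the factor is 1 + ilog2(cpus): 1 on a single CPU, 3 on four CPUs, 4 on eight or more. update_sysctl() below then multiplies sched_min_granularity, sched_latency and sched_wakeup_granularity by that factor, so, assuming the usual 6 ms normalized latency (not shown here), an 8-way machine ends up with an effective sched_latency of 24 ms.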
6139
6140static void update_sysctl(void)
6141{
6142 unsigned int factor = get_update_sysctl_factor();
6143
6144#define SET_SYSCTL(name) \
6145 (sysctl_##name = (factor) * normalized_sysctl_##name)
6146 SET_SYSCTL(sched_min_granularity);
6147 SET_SYSCTL(sched_latency);
6148 SET_SYSCTL(sched_wakeup_granularity);
6149#undef SET_SYSCTL
6150}
6151
6152static inline void sched_init_granularity(void)
6153{
6154 update_sysctl();
6155}
6156
6157#ifdef CONFIG_SMP 4879#ifdef CONFIG_SMP
6158void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4880void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6159{ 4881{
@@ -6340,30 +5062,6 @@ static void calc_global_load_remove(struct rq *rq)
6340 rq->calc_load_active = 0; 5062 rq->calc_load_active = 0;
6341} 5063}
6342 5064
6343#ifdef CONFIG_CFS_BANDWIDTH
6344static void unthrottle_offline_cfs_rqs(struct rq *rq)
6345{
6346 struct cfs_rq *cfs_rq;
6347
6348 for_each_leaf_cfs_rq(rq, cfs_rq) {
6349 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6350
6351 if (!cfs_rq->runtime_enabled)
6352 continue;
6353
6354 /*
6355 * clock_task is not advancing so we just need to make sure
6356 * there's some valid quota amount
6357 */
6358 cfs_rq->runtime_remaining = cfs_b->quota;
6359 if (cfs_rq_throttled(cfs_rq))
6360 unthrottle_cfs_rq(cfs_rq);
6361 }
6362}
6363#else
6364static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6365#endif
6366
6367/* 5065/*
6368 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5066 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6369 * try_to_wake_up()->select_task_rq(). 5067 * try_to_wake_up()->select_task_rq().
@@ -6969,6 +5667,12 @@ out:
6969 return -ENOMEM; 5667 return -ENOMEM;
6970} 5668}
6971 5669
5670/*
5671 * By default the system creates a single root-domain with all cpus as
5672 * members (mimicking the global state we have today).
5673 */
5674struct root_domain def_root_domain;
5675
6972static void init_defrootdomain(void) 5676static void init_defrootdomain(void)
6973{ 5677{
6974 init_rootdomain(&def_root_domain); 5678 init_rootdomain(&def_root_domain);
@@ -7237,7 +5941,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7237 continue; 5941 continue;
7238 5942
7239 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5943 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7240 GFP_KERNEL, cpu_to_node(i)); 5944 GFP_KERNEL, cpu_to_node(cpu));
7241 5945
7242 if (!sg) 5946 if (!sg)
7243 goto fail; 5947 goto fail;
@@ -7375,6 +6079,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7375 return; 6079 return;
7376 6080
7377 update_group_power(sd, cpu); 6081 update_group_power(sd, cpu);
6082 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6083}
6084
6085int __weak arch_sd_sibling_asym_packing(void)
6086{
6087 return 0*SD_ASYM_PACKING;
7378} 6088}
7379 6089
7380/* 6090/*
@@ -8012,29 +6722,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8012 } 6722 }
8013} 6723}
8014 6724
8015static int update_runtime(struct notifier_block *nfb,
8016 unsigned long action, void *hcpu)
8017{
8018 int cpu = (int)(long)hcpu;
8019
8020 switch (action) {
8021 case CPU_DOWN_PREPARE:
8022 case CPU_DOWN_PREPARE_FROZEN:
8023 disable_runtime(cpu_rq(cpu));
8024 return NOTIFY_OK;
8025
8026 case CPU_DOWN_FAILED:
8027 case CPU_DOWN_FAILED_FROZEN:
8028 case CPU_ONLINE:
8029 case CPU_ONLINE_FROZEN:
8030 enable_runtime(cpu_rq(cpu));
8031 return NOTIFY_OK;
8032
8033 default:
8034 return NOTIFY_DONE;
8035 }
8036}
8037
8038void __init sched_init_smp(void) 6725void __init sched_init_smp(void)
8039{ 6726{
8040 cpumask_var_t non_isolated_cpus; 6727 cpumask_var_t non_isolated_cpus;
@@ -8083,104 +6770,11 @@ int in_sched_functions(unsigned long addr)
8083 && addr < (unsigned long)__sched_text_end); 6770 && addr < (unsigned long)__sched_text_end);
8084} 6771}
8085 6772
8086static void init_cfs_rq(struct cfs_rq *cfs_rq) 6773#ifdef CONFIG_CGROUP_SCHED
8087{ 6774struct task_group root_task_group;
8088 cfs_rq->tasks_timeline = RB_ROOT;
8089 INIT_LIST_HEAD(&cfs_rq->tasks);
8090 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8091#ifndef CONFIG_64BIT
8092 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8093#endif
8094}
8095
8096static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8097{
8098 struct rt_prio_array *array;
8099 int i;
8100
8101 array = &rt_rq->active;
8102 for (i = 0; i < MAX_RT_PRIO; i++) {
8103 INIT_LIST_HEAD(array->queue + i);
8104 __clear_bit(i, array->bitmap);
8105 }
8106 /* delimiter for bitsearch: */
8107 __set_bit(MAX_RT_PRIO, array->bitmap);
8108
8109#if defined CONFIG_SMP
8110 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8111 rt_rq->highest_prio.next = MAX_RT_PRIO;
8112 rt_rq->rt_nr_migratory = 0;
8113 rt_rq->overloaded = 0;
8114 plist_head_init(&rt_rq->pushable_tasks);
8115#endif
8116
8117 rt_rq->rt_time = 0;
8118 rt_rq->rt_throttled = 0;
8119 rt_rq->rt_runtime = 0;
8120 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8121}
8122
8123#ifdef CONFIG_FAIR_GROUP_SCHED
8124static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8125 struct sched_entity *se, int cpu,
8126 struct sched_entity *parent)
8127{
8128 struct rq *rq = cpu_rq(cpu);
8129
8130 cfs_rq->tg = tg;
8131 cfs_rq->rq = rq;
8132#ifdef CONFIG_SMP
8133 /* allow initial update_cfs_load() to truncate */
8134 cfs_rq->load_stamp = 1;
8135#endif
8136 init_cfs_rq_runtime(cfs_rq);
8137
8138 tg->cfs_rq[cpu] = cfs_rq;
8139 tg->se[cpu] = se;
8140
8141 /* se could be NULL for root_task_group */
8142 if (!se)
8143 return;
8144
8145 if (!parent)
8146 se->cfs_rq = &rq->cfs;
8147 else
8148 se->cfs_rq = parent->my_q;
8149
8150 se->my_q = cfs_rq;
8151 update_load_set(&se->load, 0);
8152 se->parent = parent;
8153}
8154#endif 6775#endif
8155 6776
8156#ifdef CONFIG_RT_GROUP_SCHED 6777DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
8157static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8158 struct sched_rt_entity *rt_se, int cpu,
8159 struct sched_rt_entity *parent)
8160{
8161 struct rq *rq = cpu_rq(cpu);
8162
8163 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8164 rt_rq->rt_nr_boosted = 0;
8165 rt_rq->rq = rq;
8166 rt_rq->tg = tg;
8167
8168 tg->rt_rq[cpu] = rt_rq;
8169 tg->rt_se[cpu] = rt_se;
8170
8171 if (!rt_se)
8172 return;
8173
8174 if (!parent)
8175 rt_se->rt_rq = &rq->rt;
8176 else
8177 rt_se->rt_rq = parent->my_q;
8178
8179 rt_se->my_q = rt_rq;
8180 rt_se->parent = parent;
8181 INIT_LIST_HEAD(&rt_se->run_list);
8182}
8183#endif
8184 6778
8185void __init sched_init(void) 6779void __init sched_init(void)
8186{ 6780{
@@ -8238,9 +6832,17 @@ void __init sched_init(void)
8238#ifdef CONFIG_CGROUP_SCHED 6832#ifdef CONFIG_CGROUP_SCHED
8239 list_add(&root_task_group.list, &task_groups); 6833 list_add(&root_task_group.list, &task_groups);
8240 INIT_LIST_HEAD(&root_task_group.children); 6834 INIT_LIST_HEAD(&root_task_group.children);
6835 INIT_LIST_HEAD(&root_task_group.siblings);
8241 autogroup_init(&init_task); 6836 autogroup_init(&init_task);
6837
8242#endif /* CONFIG_CGROUP_SCHED */ 6838#endif /* CONFIG_CGROUP_SCHED */
8243 6839
6840#ifdef CONFIG_CGROUP_CPUACCT
6841 root_cpuacct.cpustat = &kernel_cpustat;
6842 root_cpuacct.cpuusage = alloc_percpu(u64);
6843 /* Too early, not expected to fail */
6844 BUG_ON(!root_cpuacct.cpuusage);
6845#endif
8244 for_each_possible_cpu(i) { 6846 for_each_possible_cpu(i) {
8245 struct rq *rq; 6847 struct rq *rq;
8246 6848
@@ -8252,7 +6854,7 @@ void __init sched_init(void)
8252 init_cfs_rq(&rq->cfs); 6854 init_cfs_rq(&rq->cfs);
8253 init_rt_rq(&rq->rt, rq); 6855 init_rt_rq(&rq->rt, rq);
8254#ifdef CONFIG_FAIR_GROUP_SCHED 6856#ifdef CONFIG_FAIR_GROUP_SCHED
8255 root_task_group.shares = root_task_group_load; 6857 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8256 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6858 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8257 /* 6859 /*
8258 * How much cpu bandwidth does root_task_group get? 6860 * How much cpu bandwidth does root_task_group get?
@@ -8302,7 +6904,7 @@ void __init sched_init(void)
8302 rq->avg_idle = 2*sysctl_sched_migration_cost; 6904 rq->avg_idle = 2*sysctl_sched_migration_cost;
8303 rq_attach_root(rq, &def_root_domain); 6905 rq_attach_root(rq, &def_root_domain);
8304#ifdef CONFIG_NO_HZ 6906#ifdef CONFIG_NO_HZ
8305 rq->nohz_balance_kick = 0; 6907 rq->nohz_flags = 0;
8306#endif 6908#endif
8307#endif 6909#endif
8308 init_rq_hrtick(rq); 6910 init_rq_hrtick(rq);
@@ -8315,10 +6917,6 @@ void __init sched_init(void)
8315 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6917 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8316#endif 6918#endif
8317 6919
8318#ifdef CONFIG_SMP
8319 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8320#endif
8321
8322#ifdef CONFIG_RT_MUTEXES 6920#ifdef CONFIG_RT_MUTEXES
8323 plist_head_init(&init_task.pi_waiters); 6921 plist_head_init(&init_task.pi_waiters);
8324#endif 6922#endif
@@ -8346,17 +6944,11 @@ void __init sched_init(void)
8346 6944
8347#ifdef CONFIG_SMP 6945#ifdef CONFIG_SMP
8348 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6946 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8349#ifdef CONFIG_NO_HZ
8350 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8351 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8352 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8353 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8354 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8355#endif
8356 /* May be allocated at isolcpus cmdline parse time */ 6947 /* May be allocated at isolcpus cmdline parse time */
8357 if (cpu_isolated_map == NULL) 6948 if (cpu_isolated_map == NULL)
8358 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6949 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8359#endif /* SMP */ 6950#endif
6951 init_sched_fair_class();
8360 6952
8361 scheduler_running = 1; 6953 scheduler_running = 1;
8362} 6954}
@@ -8508,169 +7100,14 @@ void set_curr_task(int cpu, struct task_struct *p)
8508 7100
8509#endif 7101#endif
8510 7102
8511#ifdef CONFIG_FAIR_GROUP_SCHED
8512static void free_fair_sched_group(struct task_group *tg)
8513{
8514 int i;
8515
8516 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8517
8518 for_each_possible_cpu(i) {
8519 if (tg->cfs_rq)
8520 kfree(tg->cfs_rq[i]);
8521 if (tg->se)
8522 kfree(tg->se[i]);
8523 }
8524
8525 kfree(tg->cfs_rq);
8526 kfree(tg->se);
8527}
8528
8529static
8530int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8531{
8532 struct cfs_rq *cfs_rq;
8533 struct sched_entity *se;
8534 int i;
8535
8536 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8537 if (!tg->cfs_rq)
8538 goto err;
8539 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8540 if (!tg->se)
8541 goto err;
8542
8543 tg->shares = NICE_0_LOAD;
8544
8545 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8546
8547 for_each_possible_cpu(i) {
8548 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8549 GFP_KERNEL, cpu_to_node(i));
8550 if (!cfs_rq)
8551 goto err;
8552
8553 se = kzalloc_node(sizeof(struct sched_entity),
8554 GFP_KERNEL, cpu_to_node(i));
8555 if (!se)
8556 goto err_free_rq;
8557
8558 init_cfs_rq(cfs_rq);
8559 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8560 }
8561
8562 return 1;
8563
8564err_free_rq:
8565 kfree(cfs_rq);
8566err:
8567 return 0;
8568}
8569
8570static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8571{
8572 struct rq *rq = cpu_rq(cpu);
8573 unsigned long flags;
8574
8575 /*
8576 * Only empty task groups can be destroyed; so we can speculatively
8577 * check on_list without danger of it being re-added.
8578 */
8579 if (!tg->cfs_rq[cpu]->on_list)
8580 return;
8581
8582 raw_spin_lock_irqsave(&rq->lock, flags);
8583 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8584 raw_spin_unlock_irqrestore(&rq->lock, flags);
8585}
8586#else /* !CONFIG_FAIR_GROUP_SCHED */
8587static inline void free_fair_sched_group(struct task_group *tg)
8588{
8589}
8590
8591static inline
8592int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8593{
8594 return 1;
8595}
8596
8597static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8598{
8599}
8600#endif /* CONFIG_FAIR_GROUP_SCHED */
8601
8602#ifdef CONFIG_RT_GROUP_SCHED 7103#ifdef CONFIG_RT_GROUP_SCHED
8603static void free_rt_sched_group(struct task_group *tg)
8604{
8605 int i;
8606
8607 if (tg->rt_se)
8608 destroy_rt_bandwidth(&tg->rt_bandwidth);
8609
8610 for_each_possible_cpu(i) {
8611 if (tg->rt_rq)
8612 kfree(tg->rt_rq[i]);
8613 if (tg->rt_se)
8614 kfree(tg->rt_se[i]);
8615 }
8616
8617 kfree(tg->rt_rq);
8618 kfree(tg->rt_se);
8619}
8620
8621static
8622int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8623{
8624 struct rt_rq *rt_rq;
8625 struct sched_rt_entity *rt_se;
8626 int i;
8627
8628 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8629 if (!tg->rt_rq)
8630 goto err;
8631 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8632 if (!tg->rt_se)
8633 goto err;
8634
8635 init_rt_bandwidth(&tg->rt_bandwidth,
8636 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8637
8638 for_each_possible_cpu(i) {
8639 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8640 GFP_KERNEL, cpu_to_node(i));
8641 if (!rt_rq)
8642 goto err;
8643
8644 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8645 GFP_KERNEL, cpu_to_node(i));
8646 if (!rt_se)
8647 goto err_free_rq;
8648
8649 init_rt_rq(rt_rq, cpu_rq(i));
8650 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8651 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8652 }
8653
8654 return 1;
8655
8656err_free_rq:
8657 kfree(rt_rq);
8658err:
8659 return 0;
8660}
8661#else /* !CONFIG_RT_GROUP_SCHED */ 7104#else /* !CONFIG_RT_GROUP_SCHED */
8662static inline void free_rt_sched_group(struct task_group *tg)
8663{
8664}
8665
8666static inline
8667int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8668{
8669 return 1;
8670}
8671#endif /* CONFIG_RT_GROUP_SCHED */ 7105#endif /* CONFIG_RT_GROUP_SCHED */
8672 7106
8673#ifdef CONFIG_CGROUP_SCHED 7107#ifdef CONFIG_CGROUP_SCHED
7108/* task_group_lock serializes the addition/removal of task groups */
7109static DEFINE_SPINLOCK(task_group_lock);
7110
8674static void free_sched_group(struct task_group *tg) 7111static void free_sched_group(struct task_group *tg)
8675{ 7112{
8676 free_fair_sched_group(tg); 7113 free_fair_sched_group(tg);
@@ -8776,47 +7213,6 @@ void sched_move_task(struct task_struct *tsk)
8776#endif /* CONFIG_CGROUP_SCHED */ 7213#endif /* CONFIG_CGROUP_SCHED */
8777 7214
8778#ifdef CONFIG_FAIR_GROUP_SCHED 7215#ifdef CONFIG_FAIR_GROUP_SCHED
8779static DEFINE_MUTEX(shares_mutex);
8780
8781int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8782{
8783 int i;
8784 unsigned long flags;
8785
8786 /*
8787 * We can't change the weight of the root cgroup.
8788 */
8789 if (!tg->se[0])
8790 return -EINVAL;
8791
8792 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8793
8794 mutex_lock(&shares_mutex);
8795 if (tg->shares == shares)
8796 goto done;
8797
8798 tg->shares = shares;
8799 for_each_possible_cpu(i) {
8800 struct rq *rq = cpu_rq(i);
8801 struct sched_entity *se;
8802
8803 se = tg->se[i];
8804 /* Propagate contribution to hierarchy */
8805 raw_spin_lock_irqsave(&rq->lock, flags);
8806 for_each_sched_entity(se)
8807 update_cfs_shares(group_cfs_rq(se));
8808 raw_spin_unlock_irqrestore(&rq->lock, flags);
8809 }
8810
8811done:
8812 mutex_unlock(&shares_mutex);
8813 return 0;
8814}
8815
8816unsigned long sched_group_shares(struct task_group *tg)
8817{
8818 return tg->shares;
8819}
8820#endif 7216#endif
8821 7217
8822#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7218#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -8841,7 +7237,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
8841 struct task_struct *g, *p; 7237 struct task_struct *g, *p;
8842 7238
8843 do_each_thread(g, p) { 7239 do_each_thread(g, p) {
8844 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7240 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8845 return 1; 7241 return 1;
8846 } while_each_thread(g, p); 7242 } while_each_thread(g, p);
8847 7243
@@ -9192,8 +7588,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9192 7588
9193static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7589static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9194{ 7590{
9195 int i, ret = 0, runtime_enabled; 7591 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9196 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7592 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9197 7593
9198 if (tg == &root_task_group) 7594 if (tg == &root_task_group)
9199 return -EINVAL; 7595 return -EINVAL;
@@ -9220,6 +7616,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9220 goto out_unlock; 7616 goto out_unlock;
9221 7617
9222 runtime_enabled = quota != RUNTIME_INF; 7618 runtime_enabled = quota != RUNTIME_INF;
7619 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7620 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
9223 raw_spin_lock_irq(&cfs_b->lock); 7621 raw_spin_lock_irq(&cfs_b->lock);
9224 cfs_b->period = ns_to_ktime(period); 7622 cfs_b->period = ns_to_ktime(period);
9225 cfs_b->quota = quota; 7623 cfs_b->quota = quota;
@@ -9235,13 +7633,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9235 7633
9236 for_each_possible_cpu(i) { 7634 for_each_possible_cpu(i) {
9237 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7635 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9238 struct rq *rq = rq_of(cfs_rq); 7636 struct rq *rq = cfs_rq->rq;
9239 7637
9240 raw_spin_lock_irq(&rq->lock); 7638 raw_spin_lock_irq(&rq->lock);
9241 cfs_rq->runtime_enabled = runtime_enabled; 7639 cfs_rq->runtime_enabled = runtime_enabled;
9242 cfs_rq->runtime_remaining = 0; 7640 cfs_rq->runtime_remaining = 0;
9243 7641
9244 if (cfs_rq_throttled(cfs_rq)) 7642 if (cfs_rq->throttled)
9245 unthrottle_cfs_rq(cfs_rq); 7643 unthrottle_cfs_rq(cfs_rq);
9246 raw_spin_unlock_irq(&rq->lock); 7644 raw_spin_unlock_irq(&rq->lock);
9247 } 7645 }
@@ -9255,7 +7653,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9255{ 7653{
9256 u64 quota, period; 7654 u64 quota, period;
9257 7655
9258 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7656 period = ktime_to_ns(tg->cfs_bandwidth.period);
9259 if (cfs_quota_us < 0) 7657 if (cfs_quota_us < 0)
9260 quota = RUNTIME_INF; 7658 quota = RUNTIME_INF;
9261 else 7659 else
@@ -9268,10 +7666,10 @@ long tg_get_cfs_quota(struct task_group *tg)
9268{ 7666{
9269 u64 quota_us; 7667 u64 quota_us;
9270 7668
9271 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) 7669 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9272 return -1; 7670 return -1;
9273 7671
9274 quota_us = tg_cfs_bandwidth(tg)->quota; 7672 quota_us = tg->cfs_bandwidth.quota;
9275 do_div(quota_us, NSEC_PER_USEC); 7673 do_div(quota_us, NSEC_PER_USEC);
9276 7674
9277 return quota_us; 7675 return quota_us;
@@ -9282,7 +7680,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9282 u64 quota, period; 7680 u64 quota, period;
9283 7681
9284 period = (u64)cfs_period_us * NSEC_PER_USEC; 7682 period = (u64)cfs_period_us * NSEC_PER_USEC;
9285 quota = tg_cfs_bandwidth(tg)->quota; 7683 quota = tg->cfs_bandwidth.quota;
9286 7684
9287 if (period <= 0) 7685 if (period <= 0)
9288 return -EINVAL; 7686 return -EINVAL;
@@ -9294,7 +7692,7 @@ long tg_get_cfs_period(struct task_group *tg)
9294{ 7692{
9295 u64 cfs_period_us; 7693 u64 cfs_period_us;
9296 7694
9297 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7695 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9298 do_div(cfs_period_us, NSEC_PER_USEC); 7696 do_div(cfs_period_us, NSEC_PER_USEC);
9299 7697
9300 return cfs_period_us; 7698 return cfs_period_us;
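The four helpers above (tg_set_cfs_quota(), tg_get_cfs_quota(), tg_set_cfs_period(), tg_get_cfs_period()) only translate the cgroup's microsecond knobs to and from nanoseconds, with -1 / RUNTIME_INF standing for "no limit"; the resulting quota/period ratio is the share of one CPU the group may consume per period. A small userspace sketch of that arithmetic, using made-up knob values rather than anything from this patch:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC   1000ULL
#define RUNTIME_INF     ((uint64_t)~0ULL)

/* cgroup knobs are in microseconds; the scheduler works in nanoseconds. */
static uint64_t cfs_quota_us_to_ns(long long cfs_quota_us)
{
        if (cfs_quota_us < 0)
                return RUNTIME_INF;     /* -1 means unlimited */
        return (uint64_t)cfs_quota_us * NSEC_PER_USEC;
}

int main(void)
{
        long long quota_us  = 50000;    /* cpu.cfs_quota_us  = 50 ms  (assumed) */
        long long period_us = 100000;   /* cpu.cfs_period_us = 100 ms (assumed) */
        uint64_t quota  = cfs_quota_us_to_ns(quota_us);
        uint64_t period = (uint64_t)period_us * NSEC_PER_USEC;

        if (quota == RUNTIME_INF)
                printf("no bandwidth limit\n");
        else
                printf("group may run %.0f%% of one CPU per period (%llu ns / %llu ns)\n",
                       100.0 * (double)quota / (double)period,
                       (unsigned long long)quota, (unsigned long long)period);
        return 0;
}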
@@ -9354,13 +7752,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
9354static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7752static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9355{ 7753{
9356 struct cfs_schedulable_data *d = data; 7754 struct cfs_schedulable_data *d = data;
9357 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7755 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9358 s64 quota = 0, parent_quota = -1; 7756 s64 quota = 0, parent_quota = -1;
9359 7757
9360 if (!tg->parent) { 7758 if (!tg->parent) {
9361 quota = RUNTIME_INF; 7759 quota = RUNTIME_INF;
9362 } else { 7760 } else {
9363 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); 7761 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9364 7762
9365 quota = normalize_cfs_quota(tg, d); 7763 quota = normalize_cfs_quota(tg, d);
9366 parent_quota = parent_b->hierarchal_quota; 7764 parent_quota = parent_b->hierarchal_quota;
@@ -9404,7 +7802,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9404 struct cgroup_map_cb *cb) 7802 struct cgroup_map_cb *cb)
9405{ 7803{
9406 struct task_group *tg = cgroup_tg(cgrp); 7804 struct task_group *tg = cgroup_tg(cgrp);
9407 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7805 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9408 7806
9409 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7807 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9410 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7808 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@ -9505,38 +7903,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9505 * (balbir@in.ibm.com). 7903 * (balbir@in.ibm.com).
9506 */ 7904 */
9507 7905
9508/* track cpu usage of a group of tasks and its child groups */
9509struct cpuacct {
9510 struct cgroup_subsys_state css;
9511 /* cpuusage holds pointer to a u64-type object on every cpu */
9512 u64 __percpu *cpuusage;
9513 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9514 struct cpuacct *parent;
9515};
9516
9517struct cgroup_subsys cpuacct_subsys;
9518
9519/* return cpu accounting group corresponding to this container */
9520static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9521{
9522 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9523 struct cpuacct, css);
9524}
9525
9526/* return cpu accounting group to which this task belongs */
9527static inline struct cpuacct *task_ca(struct task_struct *tsk)
9528{
9529 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9530 struct cpuacct, css);
9531}
9532
9533/* create a new cpu accounting group */ 7906/* create a new cpu accounting group */
9534static struct cgroup_subsys_state *cpuacct_create( 7907static struct cgroup_subsys_state *cpuacct_create(
9535 struct cgroup_subsys *ss, struct cgroup *cgrp) 7908 struct cgroup_subsys *ss, struct cgroup *cgrp)
9536{ 7909{
9537 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7910 struct cpuacct *ca;
9538 int i; 7911
7912 if (!cgrp->parent)
7913 return &root_cpuacct.css;
9539 7914
7915 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9540 if (!ca) 7916 if (!ca)
9541 goto out; 7917 goto out;
9542 7918
@@ -9544,18 +7920,13 @@ static struct cgroup_subsys_state *cpuacct_create(
9544 if (!ca->cpuusage) 7920 if (!ca->cpuusage)
9545 goto out_free_ca; 7921 goto out_free_ca;
9546 7922
9547 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7923 ca->cpustat = alloc_percpu(struct kernel_cpustat);
9548 if (percpu_counter_init(&ca->cpustat[i], 0)) 7924 if (!ca->cpustat)
9549 goto out_free_counters; 7925 goto out_free_cpuusage;
9550
9551 if (cgrp->parent)
9552 ca->parent = cgroup_ca(cgrp->parent);
9553 7926
9554 return &ca->css; 7927 return &ca->css;
9555 7928
9556out_free_counters: 7929out_free_cpuusage:
9557 while (--i >= 0)
9558 percpu_counter_destroy(&ca->cpustat[i]);
9559 free_percpu(ca->cpuusage); 7930 free_percpu(ca->cpuusage);
9560out_free_ca: 7931out_free_ca:
9561 kfree(ca); 7932 kfree(ca);
@@ -9568,10 +7939,8 @@ static void
9568cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7939cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9569{ 7940{
9570 struct cpuacct *ca = cgroup_ca(cgrp); 7941 struct cpuacct *ca = cgroup_ca(cgrp);
9571 int i;
9572 7942
9573 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7943 free_percpu(ca->cpustat);
9574 percpu_counter_destroy(&ca->cpustat[i]);
9575 free_percpu(ca->cpuusage); 7944 free_percpu(ca->cpuusage);
9576 kfree(ca); 7945 kfree(ca);
9577} 7946}
@@ -9664,16 +8033,31 @@ static const char *cpuacct_stat_desc[] = {
9664}; 8033};
9665 8034
9666static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8035static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9667 struct cgroup_map_cb *cb) 8036 struct cgroup_map_cb *cb)
9668{ 8037{
9669 struct cpuacct *ca = cgroup_ca(cgrp); 8038 struct cpuacct *ca = cgroup_ca(cgrp);
9670 int i; 8039 int cpu;
8040 s64 val = 0;
8041
8042 for_each_online_cpu(cpu) {
8043 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8044 val += kcpustat->cpustat[CPUTIME_USER];
8045 val += kcpustat->cpustat[CPUTIME_NICE];
8046 }
8047 val = cputime64_to_clock_t(val);
8048 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
9671 8049
9672 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8050 val = 0;
9673 s64 val = percpu_counter_read(&ca->cpustat[i]); 8051 for_each_online_cpu(cpu) {
9674 val = cputime64_to_clock_t(val); 8052 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
9675 cb->fill(cb, cpuacct_stat_desc[i], val); 8053 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8054 val += kcpustat->cpustat[CPUTIME_IRQ];
8055 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
9676 } 8056 }
8057
8058 val = cputime64_to_clock_t(val);
8059 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8060
9677 return 0; 8061 return 0;
9678} 8062}
9679 8063
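cpuacct_stats_show() above now derives the two cpuacct.stat fields straight from the per-cpu kernel_cpustat buffers: "user" is the sum of the CPUTIME_USER and CPUTIME_NICE slots over all online CPUs, "system" the sum of CPUTIME_SYSTEM, CPUTIME_IRQ and CPUTIME_SOFTIRQ, each converted to clock ticks once at the end. A minimal userspace sketch of that aggregation, with a plain array standing in for the per-cpu data; the CPUTIME_* names mirror the patch, everything else is assumed for illustration:

#include <stdio.h>
#include <stdint.h>

enum { CPUTIME_USER, CPUTIME_NICE, CPUTIME_SYSTEM,
       CPUTIME_SOFTIRQ, CPUTIME_IRQ, CPUTIME_NSTATS };

struct kernel_cpustat { uint64_t cpustat[CPUTIME_NSTATS]; };

/* Sum a set of accounting fields across all (fake) online CPUs. */
static uint64_t sum_fields(const struct kernel_cpustat *kcs, int ncpus,
                           const int *fields, int nfields)
{
        uint64_t val = 0;
        for (int cpu = 0; cpu < ncpus; cpu++)
                for (int f = 0; f < nfields; f++)
                        val += kcs[cpu].cpustat[fields[f]];
        return val;
}

int main(void)
{
        /* Two fake CPUs worth of cputime, in arbitrary units. */
        struct kernel_cpustat kcs[2] = {
                { .cpustat = { [CPUTIME_USER] = 700, [CPUTIME_NICE] = 50,
                               [CPUTIME_SYSTEM] = 200, [CPUTIME_IRQ] = 10 } },
                { .cpustat = { [CPUTIME_USER] = 300, [CPUTIME_SOFTIRQ] = 40 } },
        };
        const int user_fields[] = { CPUTIME_USER, CPUTIME_NICE };
        const int sys_fields[]  = { CPUTIME_SYSTEM, CPUTIME_IRQ, CPUTIME_SOFTIRQ };

        printf("user   %llu\n", (unsigned long long)sum_fields(kcs, 2, user_fields, 2));
        printf("system %llu\n", (unsigned long long)sum_fields(kcs, 2, sys_fields, 3));
        return 0;
}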
@@ -9703,7 +8087,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9703 * 8087 *
9704 * called with rq->lock held. 8088 * called with rq->lock held.
9705 */ 8089 */
9706static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8090void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9707{ 8091{
9708 struct cpuacct *ca; 8092 struct cpuacct *ca;
9709 int cpu; 8093 int cpu;
@@ -9717,7 +8101,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9717 8101
9718 ca = task_ca(tsk); 8102 ca = task_ca(tsk);
9719 8103
9720 for (; ca; ca = ca->parent) { 8104 for (; ca; ca = parent_ca(ca)) {
9721 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8105 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9722 *cpuusage += cputime; 8106 *cpuusage += cputime;
9723 } 8107 }
@@ -9725,46 +8109,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9725 rcu_read_unlock(); 8109 rcu_read_unlock();
9726} 8110}
9727 8111
9728/*
9729 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9730 * in cputime_t units. As a result, cpuacct_update_stats calls
9731 * percpu_counter_add with values large enough to always overflow the
9732 * per cpu batch limit causing bad SMP scalability.
9733 *
9734 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9735 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9736 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9737 */
9738#ifdef CONFIG_SMP
9739#define CPUACCT_BATCH \
9740 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9741#else
9742#define CPUACCT_BATCH 0
9743#endif
9744
9745/*
9746 * Charge the system/user time to the task's accounting group.
9747 */
9748static void cpuacct_update_stats(struct task_struct *tsk,
9749 enum cpuacct_stat_index idx, cputime_t val)
9750{
9751 struct cpuacct *ca;
9752 int batch = CPUACCT_BATCH;
9753
9754 if (unlikely(!cpuacct_subsys.active))
9755 return;
9756
9757 rcu_read_lock();
9758 ca = task_ca(tsk);
9759
9760 do {
9761 __percpu_counter_add(&ca->cpustat[idx],
9762 (__force s64) val, batch);
9763 ca = ca->parent;
9764 } while (ca);
9765 rcu_read_unlock();
9766}
9767
9768struct cgroup_subsys cpuacct_subsys = { 8112struct cgroup_subsys cpuacct_subsys = {
9769 .name = "cpuacct", 8113 .name = "cpuacct",
9770 .create = cpuacct_create, 8114 .create = cpuacct_create,
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c
index a86cf9d9eb1..b0d798eaf13 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched_cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
@@ -28,7 +28,7 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include "sched_cpupri.h" 31#include "cpupri.h"
32 32
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
34static int convert_prio(int prio) 34static int convert_prio(int prio)
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h
index f6d75617349..f6d75617349 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched/cpupri.h
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c
index a6710a112b4..2a075e10004 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched/debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/time/sched_debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree
5 * 5 *
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19#include "sched.h"
20
19static DEFINE_SPINLOCK(sched_debug_lock); 21static DEFINE_SPINLOCK(sched_debug_lock);
20 22
21/* 23/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
373 return 0; 375 return 0;
374} 376}
375 377
376static void sysrq_sched_debug_show(void) 378void sysrq_sched_debug_show(void)
377{ 379{
378 sched_debug_show(NULL, NULL); 380 sched_debug_show(NULL, NULL);
379} 381}
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index a78ed2736ba..a4d2b7abc3c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
26 33
27/* 34/*
28 * Targeted preemption latency for CPU-bound tasks: 35 * Targeted preemption latency for CPU-bound tasks:
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif 111#endif
105 112
106static const struct sched_class fair_sched_class; 113/*
114 * Increase the granularity value when there are more CPUs,
115 * because with more CPUs the 'effective latency' as visible
116 * to users decreases. But the relationship is not linear,
117 * so pick a second-best guess by going with the log2 of the
118 * number of CPUs.
119 *
120 * This idea comes from the SD scheduler of Con Kolivas:
121 */
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
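get_update_sysctl_factor() and update_sysctl() above scale sched_min_granularity, sched_latency and sched_wakeup_granularity by a factor derived from the online CPU count: 1 for no scaling, the CPU count for linear scaling, or 1 + log2(cpus) by default, with the count capped at 8. A standalone sketch of that factor computation; the 6 ms normalized latency used below is only an assumed example value:

#include <stdio.h>

/* Mirror of the three scaling modes used by get_update_sysctl_factor(). */
enum tunable_scaling { SCALING_NONE, SCALING_LINEAR, SCALING_LOG };

static unsigned int ilog2_u32(unsigned int x)
{
        unsigned int r = 0;
        while (x >>= 1)
                r++;
        return r;
}

static unsigned int scale_factor(unsigned int online_cpus, enum tunable_scaling mode)
{
        unsigned int cpus = online_cpus < 8 ? online_cpus : 8;  /* capped at 8 */

        switch (mode) {
        case SCALING_NONE:      return 1;
        case SCALING_LINEAR:    return cpus;
        case SCALING_LOG:
        default:                return 1 + ilog2_u32(cpus);
        }
}

int main(void)
{
        /* Assumed normalized latency in nanoseconds, purely for illustration. */
        unsigned int normalized_latency_ns = 6000000;
        unsigned int cpus;

        for (cpus = 1; cpus <= 16; cpus *= 2)
                printf("%2u cpus -> factor %u, sched_latency %u ns\n",
                       cpus, scale_factor(cpus, SCALING_LOG),
                       scale_factor(cpus, SCALING_LOG) * normalized_latency_ns);
        return 0;
}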
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168/*
169 * Shift right and round:
170 */
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173/*
174 * delta *= weight / lw
175 */
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182 /*
183 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 * 2^SCHED_LOAD_RESOLUTION.
186 */
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203 /*
204 * Check whether we'd overflow the 64-bit multiplication:
205 */
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
215
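calc_delta_mine() above computes delta_exec * weight / lw->weight without a division on the hot path: it caches inv_weight = 2^32 / lw->weight once, then multiplies by it and shifts right by WMULT_SHIFT, splitting the shift into two halves when the 64-bit intermediate product could overflow. A simplified userspace sketch of the same fixed-point trick (no SCHED_LOAD_RESOLUTION scaling, plain types; 1024 and 335 are the usual nice 0 and nice 5 weights, everything else is assumed):

#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST     (1ULL << 32)
#define WMULT_SHIFT     32

/* Shift right and round, as in the kernel's SRR() macro. */
#define SRR(x, y)       (((x) + (1ULL << ((y) - 1))) >> (y))

/* delta * weight / lw_weight via a cached fixed-point inverse. */
static uint64_t scaled_delta(uint64_t delta, unsigned long weight,
                             unsigned long lw_weight)
{
        uint64_t inv_weight = WMULT_CONST / lw_weight;  /* cached in lw->inv_weight */
        uint64_t tmp = delta * weight;

        if (tmp > WMULT_CONST)  /* split the shift to avoid overflowing 64 bits */
                return SRR(SRR(tmp, WMULT_SHIFT / 2) * inv_weight, WMULT_SHIFT / 2);

        return SRR(tmp * inv_weight, WMULT_SHIFT);
}

int main(void)
{
        /* 1 ms of runtime for a nice 5 entity on a queue shared with a nice 0 one. */
        printf("%llu\n", (unsigned long long)scaled_delta(1000000, 335, 1024 + 335));
        return 0;
}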
216const struct sched_class fair_sched_class;
107 217
108/************************************************************** 218/**************************************************************
109 * CFS operations on generic schedulable entities: 219 * CFS operations on generic schedulable entities:
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
414} 524}
415 525
416static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
417{ 527{
418 struct rb_node *left = cfs_rq->rb_leftmost; 528 struct rb_node *left = cfs_rq->rb_leftmost;
419 529
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
434} 544}
435 545
436#ifdef CONFIG_SCHED_DEBUG 546#ifdef CONFIG_SCHED_DEBUG
437static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
438{ 548{
439 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
440 550
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
684{ 794{
685 update_load_add(&cfs_rq->load, se->load.weight); 795 update_load_add(&cfs_rq->load, se->load.weight);
686 if (!parent_entity(se)) 796 if (!parent_entity(se))
687 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
688 if (entity_is_task(se)) { 798 if (entity_is_task(se)) {
689 add_cfs_task_weight(cfs_rq, se->load.weight); 799 add_cfs_task_weight(cfs_rq, se->load.weight);
690 list_add(&se->group_node, &cfs_rq->tasks); 800 list_add(&se->group_node, &cfs_rq->tasks);
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
697{ 807{
698 update_load_sub(&cfs_rq->load, se->load.weight); 808 update_load_sub(&cfs_rq->load, se->load.weight);
699 if (!parent_entity(se)) 809 if (!parent_entity(se))
700 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
701 if (entity_is_task(se)) { 811 if (entity_is_task(se)) {
702 add_cfs_task_weight(cfs_rq, -se->load.weight); 812 add_cfs_task_weight(cfs_rq, -se->load.weight);
703 list_del_init(&se->group_node); 813 list_del_init(&se->group_node);
@@ -920,6 +1030,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
920 trace_sched_stat_iowait(tsk, delta); 1030 trace_sched_stat_iowait(tsk, delta);
921 } 1031 }
922 1032
1033 trace_sched_stat_blocked(tsk, delta);
1034
923 /* 1035 /*
924 * Blocking time is in units of nanosecs, so shift by 1036 * Blocking time is in units of nanosecs, so shift by
925 * 20 to get a milliseconds-range estimation of the 1037 * 20 to get a milliseconds-range estimation of the
@@ -1287,6 +1399,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1287 */ 1399 */
1288 1400
1289#ifdef CONFIG_CFS_BANDWIDTH 1401#ifdef CONFIG_CFS_BANDWIDTH
1402
1403#ifdef HAVE_JUMP_LABEL
1404static struct jump_label_key __cfs_bandwidth_used;
1405
1406static inline bool cfs_bandwidth_used(void)
1407{
1408 return static_branch(&__cfs_bandwidth_used);
1409}
1410
1411void account_cfs_bandwidth_used(int enabled, int was_enabled)
1412{
1413 /* only need to count groups transitioning between enabled/!enabled */
1414 if (enabled && !was_enabled)
1415 jump_label_inc(&__cfs_bandwidth_used);
1416 else if (!enabled && was_enabled)
1417 jump_label_dec(&__cfs_bandwidth_used);
1418}
1419#else /* HAVE_JUMP_LABEL */
1420static bool cfs_bandwidth_used(void)
1421{
1422 return true;
1423}
1424
1425void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1426#endif /* HAVE_JUMP_LABEL */
1427
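The cfs_bandwidth_used() key above lets the fast-path checks (account_cfs_rq_runtime(), cfs_rq_throttled() and friends) stay almost free until at least one task group actually has a quota: account_cfs_bandwidth_used() bumps or drops the jump label only when a group transitions between "has quota" and "no quota". A userspace sketch of the same reference-counting idea, with a plain counter standing in for the jump label; the function names are kept from the patch, but the code below is only an illustration:

#include <stdio.h>
#include <stdbool.h>

/* Stand-in for the jump_label key: a refcount of groups with a quota set. */
static int cfs_bandwidth_users;

static bool cfs_bandwidth_used(void)
{
        return cfs_bandwidth_users > 0; /* kernel: static_branch(&__cfs_bandwidth_used) */
}

/* Count only transitions between "has quota" and "no quota", as the patch does. */
static void account_cfs_bandwidth_used(bool enabled, bool was_enabled)
{
        if (enabled && !was_enabled)
                cfs_bandwidth_users++;  /* kernel: jump_label_inc() */
        else if (!enabled && was_enabled)
                cfs_bandwidth_users--;  /* kernel: jump_label_dec() */
}

int main(void)
{
        account_cfs_bandwidth_used(true, false);        /* first group gets a quota */
        account_cfs_bandwidth_used(true, true);         /* quota changed, no transition */
        printf("throttling checks active: %d\n", cfs_bandwidth_used());
        account_cfs_bandwidth_used(false, true);        /* quota removed again */
        printf("throttling checks active: %d\n", cfs_bandwidth_used());
        return 0;
}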
1290/* 1428/*
1291 * default period for cfs group bandwidth. 1429 * default period for cfs group bandwidth.
1292 * default: 0.1s, units: nanoseconds 1430 * default: 0.1s, units: nanoseconds
@@ -1308,7 +1446,7 @@ static inline u64 sched_cfs_bandwidth_slice(void)
1308 * 1446 *
1309 * requires cfs_b->lock 1447 * requires cfs_b->lock
1310 */ 1448 */
1311static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 1449void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1312{ 1450{
1313 u64 now; 1451 u64 now;
1314 1452
@@ -1320,6 +1458,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 1458 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1321} 1459}
1322 1460
1461static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1462{
1463 return &tg->cfs_bandwidth;
1464}
1465
1323/* returns 0 on failure to allocate runtime */ 1466/* returns 0 on failure to allocate runtime */
1324static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1467static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1325{ 1468{
@@ -1421,7 +1564,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1421static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1564static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1422 unsigned long delta_exec) 1565 unsigned long delta_exec)
1423{ 1566{
1424 if (!cfs_rq->runtime_enabled) 1567 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1425 return; 1568 return;
1426 1569
1427 __account_cfs_rq_runtime(cfs_rq, delta_exec); 1570 __account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1429,13 +1572,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1429 1572
1430static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 1573static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1431{ 1574{
1432 return cfs_rq->throttled; 1575 return cfs_bandwidth_used() && cfs_rq->throttled;
1433} 1576}
1434 1577
1435/* check whether cfs_rq, or any parent, is throttled */ 1578/* check whether cfs_rq, or any parent, is throttled */
1436static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 1579static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1437{ 1580{
1438 return cfs_rq->throttle_count; 1581 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1439} 1582}
1440 1583
1441/* 1584/*
@@ -1530,7 +1673,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1530 raw_spin_unlock(&cfs_b->lock); 1673 raw_spin_unlock(&cfs_b->lock);
1531} 1674}
1532 1675
1533static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 1676void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1534{ 1677{
1535 struct rq *rq = rq_of(cfs_rq); 1678 struct rq *rq = rq_of(cfs_rq);
1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 1679 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1756,6 +1899,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1756 1899
1757static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1900static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1758{ 1901{
1902 if (!cfs_bandwidth_used())
1903 return;
1904
1759 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) 1905 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1760 return; 1906 return;
1761 1907
@@ -1801,6 +1947,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1801 */ 1947 */
1802static void check_enqueue_throttle(struct cfs_rq *cfs_rq) 1948static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1803{ 1949{
1950 if (!cfs_bandwidth_used())
1951 return;
1952
1804 /* an active group must be handled by the update_curr()->put() path */ 1953 /* an active group must be handled by the update_curr()->put() path */
1805 if (!cfs_rq->runtime_enabled || cfs_rq->curr) 1954 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1806 return; 1955 return;
@@ -1818,6 +1967,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1818/* conditionally throttle active cfs_rq's from put_prev_entity() */ 1967/* conditionally throttle active cfs_rq's from put_prev_entity() */
1819static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1968static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1820{ 1969{
1970 if (!cfs_bandwidth_used())
1971 return;
1972
1821 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 1973 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1822 return; 1974 return;
1823 1975
@@ -1830,7 +1982,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1830 1982
1831 throttle_cfs_rq(cfs_rq); 1983 throttle_cfs_rq(cfs_rq);
1832} 1984}
1833#else 1985
1986static inline u64 default_cfs_period(void);
1987static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1988static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1989
1990static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1991{
1992 struct cfs_bandwidth *cfs_b =
1993 container_of(timer, struct cfs_bandwidth, slack_timer);
1994 do_sched_cfs_slack_timer(cfs_b);
1995
1996 return HRTIMER_NORESTART;
1997}
1998
1999static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2000{
2001 struct cfs_bandwidth *cfs_b =
2002 container_of(timer, struct cfs_bandwidth, period_timer);
2003 ktime_t now;
2004 int overrun;
2005 int idle = 0;
2006
2007 for (;;) {
2008 now = hrtimer_cb_get_time(timer);
2009 overrun = hrtimer_forward(timer, now, cfs_b->period);
2010
2011 if (!overrun)
2012 break;
2013
2014 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2015 }
2016
2017 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2018}
2019
2020void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2021{
2022 raw_spin_lock_init(&cfs_b->lock);
2023 cfs_b->runtime = 0;
2024 cfs_b->quota = RUNTIME_INF;
2025 cfs_b->period = ns_to_ktime(default_cfs_period());
2026
2027 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2028 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2029 cfs_b->period_timer.function = sched_cfs_period_timer;
2030 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2031 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2032}
2033
2034static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2035{
2036 cfs_rq->runtime_enabled = 0;
2037 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2038}
2039
2040/* requires cfs_b->lock, may release to reprogram timer */
2041void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2042{
2043 /*
2044 * The timer may be active because we're trying to set a new bandwidth
2045 * period or because we're racing with the tear-down path
2046 * (timer_active==0 becomes visible before the hrtimer call-back
2047 * terminates). In either case we ensure that it's re-programmed
2048 */
2049 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2050 raw_spin_unlock(&cfs_b->lock);
2051 /* ensure cfs_b->lock is available while we wait */
2052 hrtimer_cancel(&cfs_b->period_timer);
2053
2054 raw_spin_lock(&cfs_b->lock);
2055 /* if someone else restarted the timer then we're done */
2056 if (cfs_b->timer_active)
2057 return;
2058 }
2059
2060 cfs_b->timer_active = 1;
2061 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2062}
2063
2064static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2065{
2066 hrtimer_cancel(&cfs_b->period_timer);
2067 hrtimer_cancel(&cfs_b->slack_timer);
2068}
2069
2070void unthrottle_offline_cfs_rqs(struct rq *rq)
2071{
2072 struct cfs_rq *cfs_rq;
2073
2074 for_each_leaf_cfs_rq(rq, cfs_rq) {
2075 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2076
2077 if (!cfs_rq->runtime_enabled)
2078 continue;
2079
2080 /*
2081 * clock_task is not advancing so we just need to make sure
2082 * there's some valid quota amount
2083 */
2084 cfs_rq->runtime_remaining = cfs_b->quota;
2085 if (cfs_rq_throttled(cfs_rq))
2086 unthrottle_cfs_rq(cfs_rq);
2087 }
2088}
2089
2090#else /* CONFIG_CFS_BANDWIDTH */
1834static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2091static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1835 unsigned long delta_exec) {} 2092 unsigned long delta_exec) {}
1836static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2093static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -1852,8 +2109,22 @@ static inline int throttled_lb_pair(struct task_group *tg,
1852{ 2109{
1853 return 0; 2110 return 0;
1854} 2111}
2112
2113void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2114
2115#ifdef CONFIG_FAIR_GROUP_SCHED
2116static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1855#endif 2117#endif
1856 2118
2119static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2120{
2121 return NULL;
2122}
2123static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2124void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2125
2126#endif /* CONFIG_CFS_BANDWIDTH */
2127
1857/************************************************** 2128/**************************************************
1858 * CFS operations on tasks: 2129 * CFS operations on tasks:
1859 */ 2130 */
@@ -1866,7 +2137,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1866 2137
1867 WARN_ON(task_rq(p) != rq); 2138 WARN_ON(task_rq(p) != rq);
1868 2139
1869 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { 2140 if (cfs_rq->nr_running > 1) {
1870 u64 slice = sched_slice(cfs_rq, se); 2141 u64 slice = sched_slice(cfs_rq, se);
1871 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2142 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1872 s64 delta = slice - ran; 2143 s64 delta = slice - ran;
@@ -1897,7 +2168,7 @@ static void hrtick_update(struct rq *rq)
1897{ 2168{
1898 struct task_struct *curr = rq->curr; 2169 struct task_struct *curr = rq->curr;
1899 2170
1900 if (curr->sched_class != &fair_sched_class) 2171 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
1901 return; 2172 return;
1902 2173
1903 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) 2174 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -2020,6 +2291,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2020} 2291}
2021 2292
2022#ifdef CONFIG_SMP 2293#ifdef CONFIG_SMP
2294/* Used instead of source_load when we know the type == 0 */
2295static unsigned long weighted_cpuload(const int cpu)
2296{
2297 return cpu_rq(cpu)->load.weight;
2298}
2299
2300/*
2301 * Return a low guess at the load of a migration-source cpu weighted
2302 * according to the scheduling class and "nice" value.
2303 *
2304 * We want to under-estimate the load of migration sources, to
2305 * balance conservatively.
2306 */
2307static unsigned long source_load(int cpu, int type)
2308{
2309 struct rq *rq = cpu_rq(cpu);
2310 unsigned long total = weighted_cpuload(cpu);
2311
2312 if (type == 0 || !sched_feat(LB_BIAS))
2313 return total;
2314
2315 return min(rq->cpu_load[type-1], total);
2316}
2317
2318/*
2319 * Return a high guess at the load of a migration-target cpu weighted
2320 * according to the scheduling class and "nice" value.
2321 */
2322static unsigned long target_load(int cpu, int type)
2323{
2324 struct rq *rq = cpu_rq(cpu);
2325 unsigned long total = weighted_cpuload(cpu);
2326
2327 if (type == 0 || !sched_feat(LB_BIAS))
2328 return total;
2329
2330 return max(rq->cpu_load[type-1], total);
2331}
2332
2333static unsigned long power_of(int cpu)
2334{
2335 return cpu_rq(cpu)->cpu_power;
2336}
2337
2338static unsigned long cpu_avg_load_per_task(int cpu)
2339{
2340 struct rq *rq = cpu_rq(cpu);
2341 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2342
2343 if (nr_running)
2344 return rq->load.weight / nr_running;
2345
2346 return 0;
2347}
2348
2023 2349
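source_load() and target_load() above bias the instantaneous weighted_cpuload() with the decayed cpu_load[] history when LB_BIAS is enabled: a migration source is deliberately under-estimated (min of current and historic load) and a target over-estimated (max), so the balancer only moves work when an imbalance persists. A tiny numeric sketch of that min/max biasing, with made-up load values:

#include <stdio.h>

/* Conservative biasing as in source_load()/target_load() with LB_BIAS enabled. */
static unsigned long source_load(unsigned long current_load, unsigned long historic_load)
{
        return current_load < historic_load ? current_load : historic_load;    /* min */
}

static unsigned long target_load(unsigned long current_load, unsigned long historic_load)
{
        return current_load > historic_load ? current_load : historic_load;    /* max */
}

int main(void)
{
        /* A CPU that just spiked: instantaneous load 2048, decayed history 1024. */
        printf("as a migration source it looks like %lu\n", source_load(2048, 1024));
        /* A CPU that just went idle: instantaneous load 0, decayed history 1024. */
        printf("as a migration target it looks like %lu\n", target_load(0, 1024));
        return 0;
}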
2024static void task_waking_fair(struct task_struct *p) 2350static void task_waking_fair(struct task_struct *p)
2025{ 2351{
@@ -2318,6 +2644,28 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2318 return idlest; 2644 return idlest;
2319} 2645}
2320 2646
2647/**
2648 * highest_flag_domain - Return highest sched_domain containing flag.
2649 * @cpu: The cpu whose highest level of sched domain is to
2650 * be returned.
2651 * @flag: The flag to check for the highest sched_domain
2652 * for the given cpu.
2653 *
2654 * Returns the highest sched_domain of a cpu which contains the given flag.
2655 */
2656static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
2657{
2658 struct sched_domain *sd, *hsd = NULL;
2659
2660 for_each_domain(cpu, sd) {
2661 if (!(sd->flags & flag))
2662 break;
2663 hsd = sd;
2664 }
2665
2666 return hsd;
2667}
2668
2321/* 2669/*
2322 * Try and locate an idle CPU in the sched_domain. 2670 * Try and locate an idle CPU in the sched_domain.
2323 */ 2671 */
@@ -2327,7 +2675,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
2327 int prev_cpu = task_cpu(p); 2675 int prev_cpu = task_cpu(p);
2328 struct sched_domain *sd; 2676 struct sched_domain *sd;
2329 struct sched_group *sg; 2677 struct sched_group *sg;
2330 int i, smt = 0; 2678 int i;
2331 2679
2332 /* 2680 /*
2333 * If the task is going to be woken-up on this cpu and if it is 2681 * If the task is going to be woken-up on this cpu and if it is
@@ -2347,19 +2695,9 @@ static int select_idle_sibling(struct task_struct *p, int target)
2347 * Otherwise, iterate the domains and find an eligible idle cpu. 2695
2348 */ 2696 */
2349 rcu_read_lock(); 2697 rcu_read_lock();
2350again:
2351 for_each_domain(target, sd) {
2352 if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
2353 continue;
2354
2355 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
2356 if (!smt) {
2357 smt = 1;
2358 goto again;
2359 }
2360 break;
2361 }
2362 2698
2699 sd = highest_flag_domain(target, SD_SHARE_PKG_RESOURCES);
2700 for_each_lower_domain(sd) {
2363 sg = sd->groups; 2701 sg = sd->groups;
2364 do { 2702 do {
2365 if (!cpumask_intersects(sched_group_cpus(sg), 2703 if (!cpumask_intersects(sched_group_cpus(sg),
@@ -2406,6 +2744,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2406 int want_sd = 1; 2744 int want_sd = 1;
2407 int sync = wake_flags & WF_SYNC; 2745 int sync = wake_flags & WF_SYNC;
2408 2746
2747 if (p->rt.nr_cpus_allowed == 1)
2748 return prev_cpu;
2749
2409 if (sd_flag & SD_BALANCE_WAKE) { 2750 if (sd_flag & SD_BALANCE_WAKE) {
2410 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 2751 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2411 want_affine = 1; 2752 want_affine = 1;
@@ -2690,7 +3031,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
2690 } while (cfs_rq); 3031 } while (cfs_rq);
2691 3032
2692 p = task_of(se); 3033 p = task_of(se);
2693 hrtick_start_fair(rq, p); 3034 if (hrtick_enabled(rq))
3035 hrtick_start_fair(rq, p);
2694 3036
2695 return p; 3037 return p;
2696} 3038}
@@ -2734,6 +3076,12 @@ static void yield_task_fair(struct rq *rq)
2734 * Update run-time statistics of the 'current'. 3076 * Update run-time statistics of the 'current'.
2735 */ 3077 */
2736 update_curr(cfs_rq); 3078 update_curr(cfs_rq);
3079 /*
3080 * Tell update_rq_clock() that we've just updated,
3081 * so we don't do microscopic update in schedule()
3082 * and double the fastpath cost.
3083 */
3084 rq->skip_clock_update = 1;
2737 } 3085 }
2738 3086
2739 set_skip_buddy(se); 3087 set_skip_buddy(se);
@@ -2774,6 +3122,38 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2774} 3122}
2775 3123
2776/* 3124/*
3125 * Is this task likely cache-hot:
3126 */
3127static int
3128task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3129{
3130 s64 delta;
3131
3132 if (p->sched_class != &fair_sched_class)
3133 return 0;
3134
3135 if (unlikely(p->policy == SCHED_IDLE))
3136 return 0;
3137
3138 /*
3139 * Buddy candidates are cache hot:
3140 */
3141 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3142 (&p->se == cfs_rq_of(&p->se)->next ||
3143 &p->se == cfs_rq_of(&p->se)->last))
3144 return 1;
3145
3146 if (sysctl_sched_migration_cost == -1)
3147 return 1;
3148 if (sysctl_sched_migration_cost == 0)
3149 return 0;
3150
3151 delta = now - p->se.exec_start;
3152
3153 return delta < (s64)sysctl_sched_migration_cost;
3154}
3155
3156/*
2777 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3157 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2778 */ 3158 */
2779static 3159static
@@ -3153,15 +3533,6 @@ struct sg_lb_stats {
3153}; 3533};
3154 3534
3155/** 3535/**
3156 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3157 * @group: The group whose first cpu is to be returned.
3158 */
3159static inline unsigned int group_first_cpu(struct sched_group *group)
3160{
3161 return cpumask_first(sched_group_cpus(group));
3162}
3163
3164/**
3165 * get_sd_load_idx - Obtain the load index for a given sched domain. 3536 * get_sd_load_idx - Obtain the load index for a given sched domain.
3166 * @sd: The sched_domain whose load_idx is to be obtained. 3537 * @sd: The sched_domain whose load_idx is to be obtained.
3167 * @idle: The Idle status of the CPU for whose sd load_idx is obtained. 3538
@@ -3410,7 +3781,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3410 sdg->sgp->power = power; 3781 sdg->sgp->power = power;
3411} 3782}
3412 3783
3413static void update_group_power(struct sched_domain *sd, int cpu) 3784void update_group_power(struct sched_domain *sd, int cpu)
3414{ 3785{
3415 struct sched_domain *child = sd->child; 3786 struct sched_domain *child = sd->child;
3416 struct sched_group *group, *sdg = sd->groups; 3787 struct sched_group *group, *sdg = sd->groups;
@@ -3676,11 +4047,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3676 } while (sg != sd->groups); 4047 } while (sg != sd->groups);
3677} 4048}
3678 4049
3679int __weak arch_sd_sibling_asym_packing(void)
3680{
3681 return 0*SD_ASYM_PACKING;
3682}
3683
3684/** 4050/**
3685 * check_asym_packing - Check to see if the group is packed into the 4051 * check_asym_packing - Check to see if the group is packed into the
3686 * sched domain. 4052
@@ -4044,7 +4410,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4044#define MAX_PINNED_INTERVAL 512 4410#define MAX_PINNED_INTERVAL 512
4045 4411
4046/* Working cpumask for load_balance and load_balance_newidle. */ 4412/* Working cpumask for load_balance and load_balance_newidle. */
4047static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4413DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4048 4414
4049static int need_active_balance(struct sched_domain *sd, int idle, 4415static int need_active_balance(struct sched_domain *sd, int idle,
4050 int busiest_cpu, int this_cpu) 4416 int busiest_cpu, int this_cpu)
@@ -4247,7 +4613,7 @@ out:
4247 * idle_balance is called by schedule() if this_cpu is about to become 4613 * idle_balance is called by schedule() if this_cpu is about to become
4248 * idle. Attempts to pull tasks from other CPUs. 4614 * idle. Attempts to pull tasks from other CPUs.
4249 */ 4615 */
4250static void idle_balance(int this_cpu, struct rq *this_rq) 4616void idle_balance(int this_cpu, struct rq *this_rq)
4251{ 4617{
4252 struct sched_domain *sd; 4618 struct sched_domain *sd;
4253 int pulled_task = 0; 4619 int pulled_task = 0;
@@ -4362,28 +4728,16 @@ out_unlock:
4362#ifdef CONFIG_NO_HZ 4728#ifdef CONFIG_NO_HZ
4363/* 4729/*
4364 * idle load balancing details 4730 * idle load balancing details
4365 * - One of the idle CPUs nominates itself as idle load_balancer, while
4366 * entering idle.
4367 * - This idle load balancer CPU will also go into tickless mode when
4368 * it is idle, just like all other idle CPUs
4369 * - When one of the busy CPUs notice that there may be an idle rebalancing 4731 * - When one of the busy CPUs notice that there may be an idle rebalancing
4370 * needed, they will kick the idle load balancer, which then does idle 4732 * needed, they will kick the idle load balancer, which then does idle
4371 * load balancing for all the idle CPUs. 4733 * load balancing for all the idle CPUs.
4372 */ 4734 */
4373static struct { 4735static struct {
4374 atomic_t load_balancer;
4375 atomic_t first_pick_cpu;
4376 atomic_t second_pick_cpu;
4377 cpumask_var_t idle_cpus_mask; 4736 cpumask_var_t idle_cpus_mask;
4378 cpumask_var_t grp_idle_mask; 4737 atomic_t nr_cpus;
4379 unsigned long next_balance; /* in jiffy units */ 4738 unsigned long next_balance; /* in jiffy units */
4380} nohz ____cacheline_aligned; 4739} nohz ____cacheline_aligned;
4381 4740
4382int get_nohz_load_balancer(void)
4383{
4384 return atomic_read(&nohz.load_balancer);
4385}
4386
4387#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4741#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4388/** 4742/**
4389 * lowest_flag_domain - Return lowest sched_domain containing flag. 4743 * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4420,33 +4774,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4420 (sd && (sd->flags & flag)); sd = sd->parent) 4774 (sd && (sd->flags & flag)); sd = sd->parent)
4421 4775
4422/** 4776/**
4423 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4424 * @ilb_group: group to be checked for semi-idleness
4425 *
4426 * Returns: 1 if the group is semi-idle. 0 otherwise.
4427 *
4428 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4429 * and atleast one non-idle CPU. This helper function checks if the given
4430 * sched_group is semi-idle or not.
4431 */
4432static inline int is_semi_idle_group(struct sched_group *ilb_group)
4433{
4434 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
4435 sched_group_cpus(ilb_group));
4436
4437 /*
4438 * A sched_group is semi-idle when it has atleast one busy cpu
4439 * and atleast one idle cpu.
4440 */
4441 if (cpumask_empty(nohz.grp_idle_mask))
4442 return 0;
4443
4444 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
4445 return 0;
4446
4447 return 1;
4448}
4449/**
4450 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4777 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4451 * @cpu: The cpu which is nominating a new idle_load_balancer. 4778 * @cpu: The cpu which is nominating a new idle_load_balancer.
4452 * 4779 *
@@ -4460,9 +4787,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
4460 */ 4787 */
4461static int find_new_ilb(int cpu) 4788static int find_new_ilb(int cpu)
4462{ 4789{
4790 int ilb = cpumask_first(nohz.idle_cpus_mask);
4791 struct sched_group *ilbg;
4463 struct sched_domain *sd; 4792 struct sched_domain *sd;
4464 struct sched_group *ilb_group;
4465 int ilb = nr_cpu_ids;
4466 4793
4467 /* 4794 /*
4468 * Have idle load balancer selection from semi-idle packages only 4795 * Have idle load balancer selection from semi-idle packages only
@@ -4480,23 +4807,28 @@ static int find_new_ilb(int cpu)
4480 4807
4481 rcu_read_lock(); 4808 rcu_read_lock();
4482 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4809 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4483 ilb_group = sd->groups; 4810 ilbg = sd->groups;
4484 4811
4485 do { 4812 do {
4486 if (is_semi_idle_group(ilb_group)) { 4813 if (ilbg->group_weight !=
4487 ilb = cpumask_first(nohz.grp_idle_mask); 4814 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4815 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4816 sched_group_cpus(ilbg));
4488 goto unlock; 4817 goto unlock;
4489 } 4818 }
4490 4819
4491 ilb_group = ilb_group->next; 4820 ilbg = ilbg->next;
4492 4821
4493 } while (ilb_group != sd->groups); 4822 } while (ilbg != sd->groups);
4494 } 4823 }
4495unlock: 4824unlock:
4496 rcu_read_unlock(); 4825 rcu_read_unlock();
4497 4826
4498out_done: 4827out_done:
4499 return ilb; 4828 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4829 return ilb;
4830
4831 return nr_cpu_ids;
4500} 4832}
4501#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4833#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4502static inline int find_new_ilb(int call_cpu) 4834static inline int find_new_ilb(int call_cpu)
@@ -4516,99 +4848,68 @@ static void nohz_balancer_kick(int cpu)
4516 4848
4517 nohz.next_balance++; 4849 nohz.next_balance++;
4518 4850
4519 ilb_cpu = get_nohz_load_balancer(); 4851 ilb_cpu = find_new_ilb(cpu);
4520
4521 if (ilb_cpu >= nr_cpu_ids) {
4522 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
4523 if (ilb_cpu >= nr_cpu_ids)
4524 return;
4525 }
4526 4852
4527 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4853 if (ilb_cpu >= nr_cpu_ids)
4528 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4854 return;
4529 4855
4530 smp_mb(); 4856 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
4531 /* 4857 return;
4532 * Use smp_send_reschedule() instead of resched_cpu(). 4858 /*
4533 * This way we generate a sched IPI on the target cpu which 4859 * Use smp_send_reschedule() instead of resched_cpu().
4534 * is idle. And the softirq performing nohz idle load balance 4860 * This way we generate a sched IPI on the target cpu which
4535 * will be run before returning from the IPI. 4861 * is idle. And the softirq performing nohz idle load balance
4536 */ 4862 * will be run before returning from the IPI.
4537 smp_send_reschedule(ilb_cpu); 4863 */
4538 } 4864 smp_send_reschedule(ilb_cpu);
4539 return; 4865 return;
4540} 4866}
4541 4867
4542/* 4868static inline void set_cpu_sd_state_busy(void)
4543 * This routine will try to nominate the ilb (idle load balancing)
4544 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4545 * load balancing on behalf of all those cpus.
4546 *
4547 * When the ilb owner becomes busy, we will not have new ilb owner until some
4548 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
4549 * idle load balancing by kicking one of the idle CPUs.
4550 *
4551 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
4552 * ilb owner CPU in future (when there is a need for idle load balancing on
4553 * behalf of all idle CPUs).
4554 */
4555void select_nohz_load_balancer(int stop_tick)
4556{ 4869{
4870 struct sched_domain *sd;
4557 int cpu = smp_processor_id(); 4871 int cpu = smp_processor_id();
4558 4872
4559 if (stop_tick) { 4873 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4560 if (!cpu_active(cpu)) { 4874 return;
4561 if (atomic_read(&nohz.load_balancer) != cpu) 4875 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
4562 return;
4563
4564 /*
4565 * If we are going offline and still the leader,
4566 * give up!
4567 */
4568 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4569 nr_cpu_ids) != cpu)
4570 BUG();
4571 4876
4572 return; 4877 rcu_read_lock();
4573 } 4878 for_each_domain(cpu, sd)
4879 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4880 rcu_read_unlock();
4881}
4574 4882
4575 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4883void set_cpu_sd_state_idle(void)
4884{
4885 struct sched_domain *sd;
4886 int cpu = smp_processor_id();
4576 4887
4577 if (atomic_read(&nohz.first_pick_cpu) == cpu) 4888 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4578 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); 4889 return;
4579 if (atomic_read(&nohz.second_pick_cpu) == cpu) 4890 set_bit(NOHZ_IDLE, nohz_flags(cpu));
4580 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
4581 4891
4582 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { 4892 rcu_read_lock();
4583 int new_ilb; 4893 for_each_domain(cpu, sd)
4894 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4895 rcu_read_unlock();
4896}
4584 4897
4585 /* make me the ilb owner */ 4898/*
4586 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, 4899 * This routine will record that this cpu is going idle with tick stopped.
4587 cpu) != nr_cpu_ids) 4900 * This info will be used in performing idle load balancing in the future.
4588 return; 4901 */
4902void select_nohz_load_balancer(int stop_tick)
4903{
4904 int cpu = smp_processor_id();
4589 4905
4590 /* 4906 if (stop_tick) {
4591 * Check to see if there is a more power-efficient 4907 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4592 * ilb.
4593 */
4594 new_ilb = find_new_ilb(cpu);
4595 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4596 atomic_set(&nohz.load_balancer, nr_cpu_ids);
4597 resched_cpu(new_ilb);
4598 return;
4599 }
4600 return;
4601 }
4602 } else {
4603 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
4604 return; 4908 return;
4605 4909
4606 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4910 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4607 4911 atomic_inc(&nohz.nr_cpus);
4608 if (atomic_read(&nohz.load_balancer) == cpu) 4912 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4609 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4610 nr_cpu_ids) != cpu)
4611 BUG();
4612 } 4913 }
4613 return; 4914 return;
4614} 4915}
@@ -4622,7 +4923,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4622 * Scale the max load_balance interval with the number of CPUs in the system. 4923 * Scale the max load_balance interval with the number of CPUs in the system.
4623 * This trades load-balance latency on larger machines for less cross talk. 4924 * This trades load-balance latency on larger machines for less cross talk.
4624 */ 4925 */
4625static void update_max_interval(void) 4926void update_max_interval(void)
4626{ 4927{
4627 max_load_balance_interval = HZ*num_online_cpus()/10; 4928 max_load_balance_interval = HZ*num_online_cpus()/10;
4628} 4929}
@@ -4714,11 +5015,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4714 struct rq *rq; 5015 struct rq *rq;
4715 int balance_cpu; 5016 int balance_cpu;
4716 5017
4717 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) 5018 if (idle != CPU_IDLE ||
4718 return; 5019 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5020 goto end;
4719 5021
4720 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 5022 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4721 if (balance_cpu == this_cpu) 5023 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4722 continue; 5024 continue;
4723 5025
4724 /* 5026 /*
@@ -4726,10 +5028,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4726 * work being done for other cpus. Next load 5028 * work being done for other cpus. Next load
4727 * balancing owner will pick it up. 5029 * balancing owner will pick it up.
4728 */ 5030 */
4729 if (need_resched()) { 5031 if (need_resched())
4730 this_rq->nohz_balance_kick = 0;
4731 break; 5032 break;
4732 }
4733 5033
4734 raw_spin_lock_irq(&this_rq->lock); 5034 raw_spin_lock_irq(&this_rq->lock);
4735 update_rq_clock(this_rq); 5035 update_rq_clock(this_rq);
@@ -4743,53 +5043,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4743 this_rq->next_balance = rq->next_balance; 5043 this_rq->next_balance = rq->next_balance;
4744 } 5044 }
4745 nohz.next_balance = this_rq->next_balance; 5045 nohz.next_balance = this_rq->next_balance;
4746 this_rq->nohz_balance_kick = 0; 5046end:
5047 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4747} 5048}
4748 5049
4749/* 5050/*
4750 * Current heuristic for kicking the idle load balancer 5051 * Current heuristic for kicking the idle load balancer in the presence
4751 * - first_pick_cpu is the one of the busy CPUs. It will kick 5052 * of an idle cpu in the system.
4752 * idle load balancer when it has more than one process active. This 5053 * - This rq has more than one task.
4753 * eliminates the need for idle load balancing altogether when we have 5054 * - At any scheduler domain level, this cpu's scheduler group has multiple
4754 * only one running process in the system (common case). 5055 * busy cpu's exceeding the group's power.
4755 * - If there are more than one busy CPU, idle load balancer may have 5056 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
4756 * to run for active_load_balance to happen (i.e., two busy CPUs are 5057 * domain span are idle.
4757 * SMT or core siblings and can run better if they move to different
4758 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
4759 * which will kick idle load balancer as soon as it has any load.
4760 */ 5058 */
4761static inline int nohz_kick_needed(struct rq *rq, int cpu) 5059static inline int nohz_kick_needed(struct rq *rq, int cpu)
4762{ 5060{
4763 unsigned long now = jiffies; 5061 unsigned long now = jiffies;
4764 int ret; 5062 struct sched_domain *sd;
4765 int first_pick_cpu, second_pick_cpu;
4766 5063
4767 if (time_before(now, nohz.next_balance)) 5064 if (unlikely(idle_cpu(cpu)))
4768 return 0; 5065 return 0;
4769 5066
4770 if (idle_cpu(cpu)) 5067 /*
4771 return 0; 5068 * We may be recently in ticked or tickless idle mode. At the first
5069 * busy tick after returning from idle, we will update the busy stats.
5070 */
5071 set_cpu_sd_state_busy();
5072 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5073 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5074 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5075 atomic_dec(&nohz.nr_cpus);
5076 }
4772 5077
4773 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 5078 /*
4774 second_pick_cpu = atomic_read(&nohz.second_pick_cpu); 5079 * None are in tickless mode and hence no need for NOHZ idle load
5080 * balancing.
5081 */
5082 if (likely(!atomic_read(&nohz.nr_cpus)))
5083 return 0;
4775 5084
4776 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && 5085 if (time_before(now, nohz.next_balance))
4777 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
4778 return 0; 5086 return 0;
4779 5087
4780 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); 5088 if (rq->nr_running >= 2)
4781 if (ret == nr_cpu_ids || ret == cpu) { 5089 goto need_kick;
4782 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); 5090
4783 if (rq->nr_running > 1) 5091 rcu_read_lock();
4784 return 1; 5092 for_each_domain(cpu, sd) {
4785 } else { 5093 struct sched_group *sg = sd->groups;
4786 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); 5094 struct sched_group_power *sgp = sg->sgp;
4787 if (ret == nr_cpu_ids || ret == cpu) { 5095 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4788 if (rq->nr_running) 5096
4789 return 1; 5097 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4790 } 5098 goto need_kick_unlock;
5099
5100 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5101 && (cpumask_first_and(nohz.idle_cpus_mask,
5102 sched_domain_span(sd)) < cpu))
5103 goto need_kick_unlock;
5104
5105 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5106 break;
4791 } 5107 }
5108 rcu_read_unlock();
4792 return 0; 5109 return 0;
5110
5111need_kick_unlock:
5112 rcu_read_unlock();
5113need_kick:
5114 return 1;
4793} 5115}
4794#else 5116#else
4795static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 5117static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
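[editor's note] The rewritten nohz_kick_needed() above asks for a kick when this busy rq has two or more tasks (or when sched-domain packing conditions hold) and at least one cpu is tickless idle. A simplified userspace model of the core decision, with the sched-domain checks deliberately omitted (everything here is an assumption, not kernel API):

#include <stdio.h>

struct cpu_state {
	int idle;		/* this cpu is idle               */
	int nr_running;		/* tasks queued on this cpu       */
};

static int nr_tickless_idle;	/* models nohz.nr_cpus            */

static int kick_needed(const struct cpu_state *cpu)
{
	if (cpu->idle)
		return 0;	/* idle cpus never ask for a kick  */
	if (!nr_tickless_idle)
		return 0;	/* nobody available to do the work */
	return cpu->nr_running >= 2;
}

int main(void)
{
	struct cpu_state busy = { .idle = 0, .nr_running = 3 };

	nr_tickless_idle = 1;
	printf("kick needed: %d\n", kick_needed(&busy));
	return 0;
}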
@@ -4824,14 +5146,14 @@ static inline int on_null_domain(int cpu)
4824/* 5146/*
4825 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5147 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4826 */ 5148 */
4827static inline void trigger_load_balance(struct rq *rq, int cpu) 5149void trigger_load_balance(struct rq *rq, int cpu)
4828{ 5150{
4829 /* Don't need to rebalance while attached to NULL domain */ 5151 /* Don't need to rebalance while attached to NULL domain */
4830 if (time_after_eq(jiffies, rq->next_balance) && 5152 if (time_after_eq(jiffies, rq->next_balance) &&
4831 likely(!on_null_domain(cpu))) 5153 likely(!on_null_domain(cpu)))
4832 raise_softirq(SCHED_SOFTIRQ); 5154 raise_softirq(SCHED_SOFTIRQ);
4833#ifdef CONFIG_NO_HZ 5155#ifdef CONFIG_NO_HZ
4834 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5156 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4835 nohz_balancer_kick(cpu); 5157 nohz_balancer_kick(cpu);
4836#endif 5158#endif
4837} 5159}
@@ -4846,15 +5168,6 @@ static void rq_offline_fair(struct rq *rq)
4846 update_sysctl(); 5168 update_sysctl();
4847} 5169}
4848 5170
4849#else /* CONFIG_SMP */
4850
4851/*
4852 * on UP we do not need to balance between CPUs:
4853 */
4854static inline void idle_balance(int cpu, struct rq *rq)
4855{
4856}
4857
4858#endif /* CONFIG_SMP */ 5171#endif /* CONFIG_SMP */
4859 5172
4860/* 5173/*
@@ -4997,6 +5310,16 @@ static void set_curr_task_fair(struct rq *rq)
4997 } 5310 }
4998} 5311}
4999 5312
5313void init_cfs_rq(struct cfs_rq *cfs_rq)
5314{
5315 cfs_rq->tasks_timeline = RB_ROOT;
5316 INIT_LIST_HEAD(&cfs_rq->tasks);
5317 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5318#ifndef CONFIG_64BIT
5319 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5320#endif
5321}
5322
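[editor's note] init_cfs_rq() seeds min_vruntime with (u64)(-(1LL << 20)), i.e. about one millisecond's worth of nanoseconds below the u64 wrap point; the usual reading is that this exercises the wrap-safe signed comparisons soon after boot. A quick standalone check of those numbers (the rationale wording is my interpretation, not a kernel comment quoted here):

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	uint64_t min_vruntime = (uint64_t)(-(1LL << 20));

	printf("seed             = %" PRIu64 "\n", min_vruntime);
	printf("distance to wrap = %" PRIu64 " ns\n", (uint64_t)0 - min_vruntime);

	/* the wrap-safe trick: subtract as u64, interpret as signed */
	int64_t delta = (int64_t)((min_vruntime + (1 << 21)) - min_vruntime);
	printf("signed delta across the wrap = %" PRId64 "\n", delta);
	return 0;
}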
5000#ifdef CONFIG_FAIR_GROUP_SCHED 5323#ifdef CONFIG_FAIR_GROUP_SCHED
5001static void task_move_group_fair(struct task_struct *p, int on_rq) 5324static void task_move_group_fair(struct task_struct *p, int on_rq)
5002{ 5325{
@@ -5019,7 +5342,161 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5019 if (!on_rq) 5342 if (!on_rq)
5020 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5343 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5021} 5344}
5345
5346void free_fair_sched_group(struct task_group *tg)
5347{
5348 int i;
5349
5350 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5351
5352 for_each_possible_cpu(i) {
5353 if (tg->cfs_rq)
5354 kfree(tg->cfs_rq[i]);
5355 if (tg->se)
5356 kfree(tg->se[i]);
5357 }
5358
5359 kfree(tg->cfs_rq);
5360 kfree(tg->se);
5361}
5362
5363int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5364{
5365 struct cfs_rq *cfs_rq;
5366 struct sched_entity *se;
5367 int i;
5368
5369 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5370 if (!tg->cfs_rq)
5371 goto err;
5372 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5373 if (!tg->se)
5374 goto err;
5375
5376 tg->shares = NICE_0_LOAD;
5377
5378 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5379
5380 for_each_possible_cpu(i) {
5381 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5382 GFP_KERNEL, cpu_to_node(i));
5383 if (!cfs_rq)
5384 goto err;
5385
5386 se = kzalloc_node(sizeof(struct sched_entity),
5387 GFP_KERNEL, cpu_to_node(i));
5388 if (!se)
5389 goto err_free_rq;
5390
5391 init_cfs_rq(cfs_rq);
5392 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5393 }
5394
5395 return 1;
5396
5397err_free_rq:
5398 kfree(cfs_rq);
5399err:
5400 return 0;
5401}
5402
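[editor's note] alloc_fair_sched_group() above follows the familiar pattern: allocate one pointer array per field, then one object per possible cpu, bailing out via goto labels on failure (the remaining cleanup is left to free_fair_sched_group()). A generic userspace sketch of that shape, which instead unwinds everything locally (struct and helper names are assumptions):

#include <stdlib.h>
#include <stdio.h>

struct per_cpu_obj { int cpu; };

static struct per_cpu_obj **alloc_per_cpu(int nr_cpus)
{
	struct per_cpu_obj **objs;
	int i;

	objs = calloc(nr_cpus, sizeof(*objs));	/* pointer array */
	if (!objs)
		return NULL;

	for (i = 0; i < nr_cpus; i++) {
		objs[i] = calloc(1, sizeof(**objs));	/* one object per cpu */
		if (!objs[i])
			goto err;			/* unwind partial work */
		objs[i]->cpu = i;
	}
	return objs;

err:
	while (i--)
		free(objs[i]);
	free(objs);
	return NULL;
}

int main(void)
{
	struct per_cpu_obj **objs = alloc_per_cpu(4);

	printf("%s\n", objs ? "allocated" : "failed");
	return 0;
}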
5403void unregister_fair_sched_group(struct task_group *tg, int cpu)
5404{
5405 struct rq *rq = cpu_rq(cpu);
5406 unsigned long flags;
5407
5408 /*
5409 * Only empty task groups can be destroyed; so we can speculatively
5410 * check on_list without danger of it being re-added.
5411 */
5412 if (!tg->cfs_rq[cpu]->on_list)
5413 return;
5414
5415 raw_spin_lock_irqsave(&rq->lock, flags);
5416 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5417 raw_spin_unlock_irqrestore(&rq->lock, flags);
5418}
5419
5420void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5421 struct sched_entity *se, int cpu,
5422 struct sched_entity *parent)
5423{
5424 struct rq *rq = cpu_rq(cpu);
5425
5426 cfs_rq->tg = tg;
5427 cfs_rq->rq = rq;
5428#ifdef CONFIG_SMP
5429 /* allow initial update_cfs_load() to truncate */
5430 cfs_rq->load_stamp = 1;
5022#endif 5431#endif
5432 init_cfs_rq_runtime(cfs_rq);
5433
5434 tg->cfs_rq[cpu] = cfs_rq;
5435 tg->se[cpu] = se;
5436
5437 /* se could be NULL for root_task_group */
5438 if (!se)
5439 return;
5440
5441 if (!parent)
5442 se->cfs_rq = &rq->cfs;
5443 else
5444 se->cfs_rq = parent->my_q;
5445
5446 se->my_q = cfs_rq;
5447 update_load_set(&se->load, 0);
5448 se->parent = parent;
5449}
5450
5451static DEFINE_MUTEX(shares_mutex);
5452
5453int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5454{
5455 int i;
5456 unsigned long flags;
5457
5458 /*
5459 * We can't change the weight of the root cgroup.
5460 */
5461 if (!tg->se[0])
5462 return -EINVAL;
5463
5464 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5465
5466 mutex_lock(&shares_mutex);
5467 if (tg->shares == shares)
5468 goto done;
5469
5470 tg->shares = shares;
5471 for_each_possible_cpu(i) {
5472 struct rq *rq = cpu_rq(i);
5473 struct sched_entity *se;
5474
5475 se = tg->se[i];
5476 /* Propagate contribution to hierarchy */
5477 raw_spin_lock_irqsave(&rq->lock, flags);
5478 for_each_sched_entity(se)
5479 update_cfs_shares(group_cfs_rq(se));
5480 raw_spin_unlock_irqrestore(&rq->lock, flags);
5481 }
5482
5483done:
5484 mutex_unlock(&shares_mutex);
5485 return 0;
5486}
5487#else /* CONFIG_FAIR_GROUP_SCHED */
5488
5489void free_fair_sched_group(struct task_group *tg) { }
5490
5491int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5492{
5493 return 1;
5494}
5495
5496void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5497
5498#endif /* CONFIG_FAIR_GROUP_SCHED */
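[editor's note] sched_group_set_shares() clamps the requested weight to [MIN_SHARES, MAX_SHARES] before propagating it. A tiny standalone sketch of just the clamping step, with the constants copied from sched.h below and scale_load() omitted:

#include <stdio.h>

#define MIN_SHARES	(1UL << 1)
#define MAX_SHARES	(1UL << 18)

static unsigned long clamp_shares(unsigned long shares)
{
	if (shares < MIN_SHARES)
		return MIN_SHARES;
	if (shares > MAX_SHARES)
		return MAX_SHARES;
	return shares;
}

int main(void)
{
	printf("0 -> %lu, 1024 -> %lu, 10000000 -> %lu\n",
	       clamp_shares(0), clamp_shares(1024), clamp_shares(10000000));
	return 0;
}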
5499
5023 5500
5024static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5501static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5025{ 5502{
@@ -5039,7 +5516,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
5039/* 5516/*
5040 * All the scheduling class methods: 5517 * All the scheduling class methods:
5041 */ 5518 */
5042static const struct sched_class fair_sched_class = { 5519const struct sched_class fair_sched_class = {
5043 .next = &idle_sched_class, 5520 .next = &idle_sched_class,
5044 .enqueue_task = enqueue_task_fair, 5521 .enqueue_task = enqueue_task_fair,
5045 .dequeue_task = dequeue_task_fair, 5522 .dequeue_task = dequeue_task_fair,
@@ -5076,7 +5553,7 @@ static const struct sched_class fair_sched_class = {
5076}; 5553};
5077 5554
5078#ifdef CONFIG_SCHED_DEBUG 5555#ifdef CONFIG_SCHED_DEBUG
5079static void print_cfs_stats(struct seq_file *m, int cpu) 5556void print_cfs_stats(struct seq_file *m, int cpu)
5080{ 5557{
5081 struct cfs_rq *cfs_rq; 5558 struct cfs_rq *cfs_rq;
5082 5559
@@ -5086,3 +5563,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
5086 rcu_read_unlock(); 5563 rcu_read_unlock();
5087} 5564}
5088#endif 5565#endif
5566
5567__init void init_sched_fair_class(void)
5568{
5569#ifdef CONFIG_SMP
5570 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5571
5572#ifdef CONFIG_NO_HZ
5573 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5574#endif
5575#endif /* SMP */
5576
5577}
diff --git a/kernel/sched_features.h b/kernel/sched/features.h
index 84802245abd..e61fd73913d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched/features.h
@@ -3,13 +3,13 @@
3 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart. 4 * rip the spread apart.
5 */ 5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7 7
8/* 8/*
9 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
10 * tasks 10 * tasks
11 */ 11 */
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
@@ -17,54 +17,54 @@ SCHED_FEAT(START_DEBIT, 1)
17 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS. 18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */ 19 */
20SCHED_FEAT(AFFINE_WAKEUPS, 1) 20SCHED_FEAT(AFFINE_WAKEUPS, true)
21 21
22/* 22/*
23 * Prefer to schedule the task we woke last (assuming it failed 23 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since its likely going to consume data we 24 * wakeup-preemption), since its likely going to consume data we
25 * touched, increases cache locality. 25 * touched, increases cache locality.
26 */ 26 */
27SCHED_FEAT(NEXT_BUDDY, 0) 27SCHED_FEAT(NEXT_BUDDY, false)
28 28
29/* 29/*
30 * Prefer to schedule the task that ran last (when we did 30 * Prefer to schedule the task that ran last (when we did
31 * wake-preempt) as that likely will touch the same data, increases 31 * wake-preempt) as that likely will touch the same data, increases
32 * cache locality. 32 * cache locality.
33 */ 33 */
34SCHED_FEAT(LAST_BUDDY, 1) 34SCHED_FEAT(LAST_BUDDY, true)
35 35
36/* 36/*
37 * Consider buddies to be cache hot, decreases the likelyness of a 37 * Consider buddies to be cache hot, decreases the likelyness of a
38 * cache buddy being migrated away, increases cache locality. 38 * cache buddy being migrated away, increases cache locality.
39 */ 39 */
40SCHED_FEAT(CACHE_HOT_BUDDY, 1) 40SCHED_FEAT(CACHE_HOT_BUDDY, true)
41 41
42/* 42/*
43 * Use arch dependent cpu power functions 43 * Use arch dependent cpu power functions
44 */ 44 */
45SCHED_FEAT(ARCH_POWER, 0) 45SCHED_FEAT(ARCH_POWER, false)
46 46
47SCHED_FEAT(HRTICK, 0) 47SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, 0) 48SCHED_FEAT(DOUBLE_TICK, false)
49SCHED_FEAT(LB_BIAS, 1) 49SCHED_FEAT(LB_BIAS, true)
50 50
51/* 51/*
52 * Spin-wait on mutex acquisition when the mutex owner is running on 52 * Spin-wait on mutex acquisition when the mutex owner is running on
53 * another cpu -- assumes that when the owner is running, it will soon 53 * another cpu -- assumes that when the owner is running, it will soon
54 * release the lock. Decreases scheduling overhead. 54 * release the lock. Decreases scheduling overhead.
55 */ 55 */
56SCHED_FEAT(OWNER_SPIN, 1) 56SCHED_FEAT(OWNER_SPIN, true)
57 57
58/* 58/*
59 * Decrement CPU power based on time not spent running tasks 59 * Decrement CPU power based on time not spent running tasks
60 */ 60 */
61SCHED_FEAT(NONTASK_POWER, 1) 61SCHED_FEAT(NONTASK_POWER, true)
62 62
63/* 63/*
64 * Queue remote wakeups on the target CPU and process them 64 * Queue remote wakeups on the target CPU and process them
65 * using the scheduler IPI. Reduces rq->lock contention/bounces. 65 * using the scheduler IPI. Reduces rq->lock contention/bounces.
66 */ 66 */
67SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, 1) 70SCHED_FEAT(RT_RUNTIME_SHARE, true)
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c
index 0a51882534e..91b4c957f28 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * idle-task scheduling class. 4 * idle-task scheduling class.
3 * 5 *
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
71/* 73/*
72 * Simple, special scheduling class for the per-CPU idle tasks: 74 * Simple, special scheduling class for the per-CPU idle tasks:
73 */ 75 */
74static const struct sched_class idle_sched_class = { 76const struct sched_class idle_sched_class = {
75 /* .next is NULL */ 77 /* .next is NULL */
76 /* no enqueue/yield_task for idle tasks */ 78 /* no enqueue/yield_task for idle tasks */
77 79
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c
index 583a1368afe..3640ebbb466 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched/rt.c
@@ -3,7 +3,92 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
6#ifdef CONFIG_RT_GROUP_SCHED 87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
7 92
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9 94
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
25 return rt_se->rt_rq; 110 return rt_se->rt_rq;
26} 111}
27 112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
28#else /* CONFIG_RT_GROUP_SCHED */ 198#else /* CONFIG_RT_GROUP_SCHED */
29 199
30#define rt_entity_is_task(rt_se) (1) 200#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
47 return &rq->rt; 217 return &rq->rt;
48} 218}
49 219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
50#endif /* CONFIG_RT_GROUP_SCHED */ 226#endif /* CONFIG_RT_GROUP_SCHED */
51 227
52#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
@@ -556,6 +732,28 @@ static void enable_runtime(struct rq *rq)
556 raw_spin_unlock_irqrestore(&rq->lock, flags); 732 raw_spin_unlock_irqrestore(&rq->lock, flags);
557} 733}
558 734
735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
736{
737 int cpu = (int)(long)hcpu;
738
739 switch (action) {
740 case CPU_DOWN_PREPARE:
741 case CPU_DOWN_PREPARE_FROZEN:
742 disable_runtime(cpu_rq(cpu));
743 return NOTIFY_OK;
744
745 case CPU_DOWN_FAILED:
746 case CPU_DOWN_FAILED_FROZEN:
747 case CPU_ONLINE:
748 case CPU_ONLINE_FROZEN:
749 enable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 default:
753 return NOTIFY_DONE;
754 }
755}
756
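[editor's note] update_runtime() is a CPU-hotplug notifier: it enables or disables RT runtime sharing depending on the action code and returns NOTIFY_OK when it handled the event, NOTIFY_DONE otherwise. A userspace model of that dispatch (enum values and output are assumptions, not the kernel's):

#include <stdio.h>

enum { CPU_ONLINE, CPU_DOWN_PREPARE, CPU_DOWN_FAILED, CPU_STARTING };
enum { NOTIFY_DONE, NOTIFY_OK };

static int model_update_runtime(unsigned long action, int cpu)
{
	switch (action) {
	case CPU_DOWN_PREPARE:
		printf("cpu%d: disable RT runtime sharing\n", cpu);
		return NOTIFY_OK;
	case CPU_DOWN_FAILED:
	case CPU_ONLINE:
		printf("cpu%d: re-enable RT runtime sharing\n", cpu);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;	/* not interested in this event */
	}
}

int main(void)
{
	model_update_runtime(CPU_DOWN_PREPARE, 3);
	model_update_runtime(CPU_STARTING, 3);
	model_update_runtime(CPU_ONLINE, 3);
	return 0;
}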
559static int balance_runtime(struct rt_rq *rt_rq) 757static int balance_runtime(struct rt_rq *rt_rq)
560{ 758{
561 int more = 0; 759 int more = 0;
@@ -648,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
648 if (rt_rq->rt_throttled) 846 if (rt_rq->rt_throttled)
649 return rt_rq_throttled(rt_rq); 847 return rt_rq_throttled(rt_rq);
650 848
651 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 849 if (runtime >= sched_rt_period(rt_rq))
652 return 0; 850 return 0;
653 851
654 balance_runtime(rt_rq); 852 balance_runtime(rt_rq);
@@ -957,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
957} 1155}
958 1156
959/* 1157/*
960 * Put task to the end of the run list without the overhead of dequeue 1158 * Put task to the head or the end of the run list without the overhead of
961 * followed by enqueue. 1159 * dequeue followed by enqueue.
962 */ 1160 */
963static void 1161static void
964requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1162requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
@@ -1002,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1002 1200
1003 cpu = task_cpu(p); 1201 cpu = task_cpu(p);
1004 1202
1203 if (p->rt.nr_cpus_allowed == 1)
1204 goto out;
1205
1005 /* For anything but wake ups, just return the task_cpu */ 1206 /* For anything but wake ups, just return the task_cpu */
1006 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1207 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1007 goto out; 1208 goto out;
@@ -1178,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1178/* Only try algorithms three times */ 1379/* Only try algorithms three times */
1179#define RT_MAX_TRIES 3 1380#define RT_MAX_TRIES 3
1180 1381
1181static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1182
1183static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1382static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1184{ 1383{
1185 if (!task_running(rq, p) && 1384 if (!task_running(rq, p) &&
@@ -1653,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1653 pull_rt_task(rq); 1852 pull_rt_task(rq);
1654} 1853}
1655 1854
1656static inline void init_sched_rt_class(void) 1855void init_sched_rt_class(void)
1657{ 1856{
1658 unsigned int i; 1857 unsigned int i;
1659 1858
1660 for_each_possible_cpu(i) 1859 for_each_possible_cpu(i) {
1661 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1860 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1662 GFP_KERNEL, cpu_to_node(i)); 1861 GFP_KERNEL, cpu_to_node(i));
1862 }
1663} 1863}
1664#endif /* CONFIG_SMP */ 1864#endif /* CONFIG_SMP */
1665 1865
@@ -1800,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1800 return 0; 2000 return 0;
1801} 2001}
1802 2002
1803static const struct sched_class rt_sched_class = { 2003const struct sched_class rt_sched_class = {
1804 .next = &fair_sched_class, 2004 .next = &fair_sched_class,
1805 .enqueue_task = enqueue_task_rt, 2005 .enqueue_task = enqueue_task_rt,
1806 .dequeue_task = dequeue_task_rt, 2006 .dequeue_task = dequeue_task_rt,
@@ -1835,7 +2035,7 @@ static const struct sched_class rt_sched_class = {
1835#ifdef CONFIG_SCHED_DEBUG 2035#ifdef CONFIG_SCHED_DEBUG
1836extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2036extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1837 2037
1838static void print_rt_stats(struct seq_file *m, int cpu) 2038void print_rt_stats(struct seq_file *m, int cpu)
1839{ 2039{
1840 rt_rq_iter_t iter; 2040 rt_rq_iter_t iter;
1841 struct rt_rq *rt_rq; 2041 struct rt_rq *rt_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 00000000000..d8d3613a405
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1136 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44
45/*
46 * single value that denotes runtime == period, ie unlimited time.
47 */
48#define RUNTIME_INF ((u64)~0ULL)
49
50static inline int rt_policy(int policy)
51{
52 if (policy == SCHED_FIFO || policy == SCHED_RR)
53 return 1;
54 return 0;
55}
56
57static inline int task_has_rt_policy(struct task_struct *p)
58{
59 return rt_policy(p->policy);
60}
61
62/*
63 * This is the priority-queue data structure of the RT scheduling class:
64 */
65struct rt_prio_array {
66 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
67 struct list_head queue[MAX_RT_PRIO];
68};
69
70struct rt_bandwidth {
71 /* nests inside the rq lock: */
72 raw_spinlock_t rt_runtime_lock;
73 ktime_t rt_period;
74 u64 rt_runtime;
75 struct hrtimer rt_period_timer;
76};
77
78extern struct mutex sched_domains_mutex;
79
80#ifdef CONFIG_CGROUP_SCHED
81
82#include <linux/cgroup.h>
83
84struct cfs_rq;
85struct rt_rq;
86
87static LIST_HEAD(task_groups);
88
89struct cfs_bandwidth {
90#ifdef CONFIG_CFS_BANDWIDTH
91 raw_spinlock_t lock;
92 ktime_t period;
93 u64 quota, runtime;
94 s64 hierarchal_quota;
95 u64 runtime_expires;
96
97 int idle, timer_active;
98 struct hrtimer period_timer, slack_timer;
99 struct list_head throttled_cfs_rq;
100
101 /* statistics */
102 int nr_periods, nr_throttled;
103 u64 throttled_time;
104#endif
105};
106
107/* task group related information */
108struct task_group {
109 struct cgroup_subsys_state css;
110
111#ifdef CONFIG_FAIR_GROUP_SCHED
112 /* schedulable entities of this group on each cpu */
113 struct sched_entity **se;
114 /* runqueue "owned" by this group on each cpu */
115 struct cfs_rq **cfs_rq;
116 unsigned long shares;
117
118 atomic_t load_weight;
119#endif
120
121#ifdef CONFIG_RT_GROUP_SCHED
122 struct sched_rt_entity **rt_se;
123 struct rt_rq **rt_rq;
124
125 struct rt_bandwidth rt_bandwidth;
126#endif
127
128 struct rcu_head rcu;
129 struct list_head list;
130
131 struct task_group *parent;
132 struct list_head siblings;
133 struct list_head children;
134
135#ifdef CONFIG_SCHED_AUTOGROUP
136 struct autogroup *autogroup;
137#endif
138
139 struct cfs_bandwidth cfs_bandwidth;
140};
141
142#ifdef CONFIG_FAIR_GROUP_SCHED
143#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
144
145/*
146 * A weight of 0 or 1 can cause arithmetics problems.
147 * A weight of a cfs_rq is the sum of weights of which entities
148 * are queued on this cfs_rq, so a weight of a entity should not be
149 * too large, so as the shares value of a task group.
150 * (The default weight is 1024 - so there's no practical
151 * limitation from this.)
152 */
153#define MIN_SHARES (1UL << 1)
154#define MAX_SHARES (1UL << 18)
155#endif
156
157/* Default task group.
158 * Every task in system belong to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *);
163
164extern int walk_tg_tree_from(struct task_group *from,
165 tg_visitor down, tg_visitor up, void *data);
166
167/*
168 * Iterate the full tree, calling @down when first entering a node and @up when
169 * leaving it for the final time.
170 *
171 * Caller must hold rcu_lock or sufficient equivalent.
172 */
173static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
174{
175 return walk_tg_tree_from(&root_task_group, down, up, data);
176}
177
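[editor's note] walk_tg_tree() visits the task-group hierarchy with two callbacks: @down on first entering a node, @up when leaving it for the last time, aborting if either returns non-zero. A minimal recursive userspace model of those semantics (the kernel version is iterative over RCU lists; struct and names here are assumptions):

#include <stdio.h>

struct tg {
	const char *name;
	struct tg *children[4];	/* NULL-terminated for this sketch */
};

typedef int (*tg_visitor)(struct tg *, void *);

static int walk_tg_tree_from(struct tg *from, tg_visitor down,
			     tg_visitor up, void *data)
{
	int i, ret;

	ret = down(from, data);		/* first entry into the node */
	if (ret)
		return ret;
	for (i = 0; i < 4 && from->children[i]; i++) {
		ret = walk_tg_tree_from(from->children[i], down, up, data);
		if (ret)
			return ret;
	}
	return up(from, data);		/* leaving for the final time */
}

static int print_down(struct tg *tg, void *data) { printf("enter %s\n", tg->name); return 0; }
static int print_up(struct tg *tg, void *data)   { printf("leave %s\n", tg->name); return 0; }

int main(void)
{
	struct tg leaf = { "child", { 0 } };
	struct tg root = { "root", { &leaf } };

	return walk_tg_tree_from(&root, print_down, print_up, NULL);
}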
178extern int tg_nop(struct task_group *tg, void *data);
179
180extern void free_fair_sched_group(struct task_group *tg);
181extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
182extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
183extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
184 struct sched_entity *se, int cpu,
185 struct sched_entity *parent);
186extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
187extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
188
189extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
190extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
191extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
192
193extern void free_rt_sched_group(struct task_group *tg);
194extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
195extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent);
198
199#else /* CONFIG_CGROUP_SCHED */
200
201struct cfs_bandwidth { };
202
203#endif /* CONFIG_CGROUP_SCHED */
204
205/* CFS-related fields in a runqueue */
206struct cfs_rq {
207 struct load_weight load;
208 unsigned long nr_running, h_nr_running;
209
210 u64 exec_clock;
211 u64 min_vruntime;
212#ifndef CONFIG_64BIT
213 u64 min_vruntime_copy;
214#endif
215
216 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost;
218
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /*
223 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running).
225 */
226 struct sched_entity *curr, *next, *last, *skip;
227
228#ifdef CONFIG_SCHED_DEBUG
229 unsigned int nr_spread_over;
230#endif
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
234
235 /*
236 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
237 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
238 * (like users, containers etc.)
239 *
240 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
241 * list is used during load balance.
242 */
243 int on_list;
244 struct list_head leaf_cfs_rq_list;
245 struct task_group *tg; /* group that "owns" this runqueue */
246
247#ifdef CONFIG_SMP
248 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg)
255 *
256 * Where f(tg) is the recursive weight fraction assigned to
257 * this group.
258 */
259 unsigned long h_load;
260
261 /*
262 * Maintaining per-cpu shares distribution for group scheduling
263 *
264 * load_stamp is the last time we updated the load average
265 * load_last is the last time we updated the load average and saw load
266 * load_unacc_exec_time is currently unaccounted execution time
267 */
268 u64 load_avg;
269 u64 load_period;
270 u64 load_stamp, load_last, load_unacc_exec_time;
271
272 unsigned long load_contribution;
273#endif /* CONFIG_SMP */
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_timestamp;
280 int throttled, throttle_count;
281 struct list_head throttled_list;
282#endif /* CONFIG_CFS_BANDWIDTH */
283#endif /* CONFIG_FAIR_GROUP_SCHED */
284};
285
286static inline int rt_bandwidth_enabled(void)
287{
288 return sysctl_sched_rt_runtime >= 0;
289}
290
291/* Real-Time classes' related field in a runqueue: */
292struct rt_rq {
293 struct rt_prio_array active;
294 unsigned long rt_nr_running;
295#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
296 struct {
297 int curr; /* highest queued rt task prio */
298#ifdef CONFIG_SMP
299 int next; /* next highest */
300#endif
301 } highest_prio;
302#endif
303#ifdef CONFIG_SMP
304 unsigned long rt_nr_migratory;
305 unsigned long rt_nr_total;
306 int overloaded;
307 struct plist_head pushable_tasks;
308#endif
309 int rt_throttled;
310 u64 rt_time;
311 u64 rt_runtime;
312 /* Nests inside the rq lock: */
313 raw_spinlock_t rt_runtime_lock;
314
315#ifdef CONFIG_RT_GROUP_SCHED
316 unsigned long rt_nr_boosted;
317
318 struct rq *rq;
319 struct list_head leaf_rt_rq_list;
320 struct task_group *tg;
321#endif
322};
323
324#ifdef CONFIG_SMP
325
326/*
327 * We add the notion of a root-domain which will be used to define per-domain
328 * variables. Each exclusive cpuset essentially defines an island domain by
329 * fully partitioning the member cpus from any other cpuset. Whenever a new
330 * exclusive cpuset is created, we also create and attach a new root-domain
331 * object.
332 *
333 */
334struct root_domain {
335 atomic_t refcount;
336 atomic_t rto_count;
337 struct rcu_head rcu;
338 cpumask_var_t span;
339 cpumask_var_t online;
340
341 /*
342 * The "RT overload" flag: it gets set if a CPU has more than
343 * one runnable RT task.
344 */
345 cpumask_var_t rto_mask;
346 struct cpupri cpupri;
347};
348
349extern struct root_domain def_root_domain;
350
351#endif /* CONFIG_SMP */
352
353/*
354 * This is the main, per-CPU runqueue data structure.
355 *
356 * Locking rule: those places that want to lock multiple runqueues
357 * (such as the load balancing or the thread migration code), lock
358 * acquire operations must be ordered by ascending &runqueue.
359 */
360struct rq {
361 /* runqueue lock: */
362 raw_spinlock_t lock;
363
364 /*
365 * nr_running and cpu_load should be in the same cacheline because
366 * remote CPUs use both these fields when doing load calculation.
367 */
368 unsigned long nr_running;
369 #define CPU_LOAD_IDX_MAX 5
370 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
371 unsigned long last_load_update_tick;
372#ifdef CONFIG_NO_HZ
373 u64 nohz_stamp;
374 unsigned long nohz_flags;
375#endif
376 int skip_clock_update;
377
378 /* capture load from *all* tasks on this cpu: */
379 struct load_weight load;
380 unsigned long nr_load_updates;
381 u64 nr_switches;
382
383 struct cfs_rq cfs;
384 struct rt_rq rt;
385
386#ifdef CONFIG_FAIR_GROUP_SCHED
387 /* list of leaf cfs_rq on this cpu: */
388 struct list_head leaf_cfs_rq_list;
389#endif
390#ifdef CONFIG_RT_GROUP_SCHED
391 struct list_head leaf_rt_rq_list;
392#endif
393
394 /*
395 * This is part of a global counter where only the total sum
396 * over all CPUs matters. A task can increase this counter on
397 * one CPU and if it got migrated afterwards it may decrease
398 * it on another CPU. Always updated under the runqueue lock:
399 */
400 unsigned long nr_uninterruptible;
401
402 struct task_struct *curr, *idle, *stop;
403 unsigned long next_balance;
404 struct mm_struct *prev_mm;
405
406 u64 clock;
407 u64 clock_task;
408
409 atomic_t nr_iowait;
410
411#ifdef CONFIG_SMP
412 struct root_domain *rd;
413 struct sched_domain *sd;
414
415 unsigned long cpu_power;
416
417 unsigned char idle_balance;
418 /* For active balancing */
419 int post_schedule;
420 int active_balance;
421 int push_cpu;
422 struct cpu_stop_work active_balance_work;
423 /* cpu of this runqueue: */
424 int cpu;
425 int online;
426
427 u64 rt_avg;
428 u64 age_stamp;
429 u64 idle_stamp;
430 u64 avg_idle;
431#endif
432
433#ifdef CONFIG_IRQ_TIME_ACCOUNTING
434 u64 prev_irq_time;
435#endif
436#ifdef CONFIG_PARAVIRT
437 u64 prev_steal_time;
438#endif
439#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
440 u64 prev_steal_time_rq;
441#endif
442
443 /* calc_load related fields */
444 unsigned long calc_load_update;
445 long calc_load_active;
446
447#ifdef CONFIG_SCHED_HRTICK
448#ifdef CONFIG_SMP
449 int hrtick_csd_pending;
450 struct call_single_data hrtick_csd;
451#endif
452 struct hrtimer hrtick_timer;
453#endif
454
455#ifdef CONFIG_SCHEDSTATS
456 /* latency stats */
457 struct sched_info rq_sched_info;
458 unsigned long long rq_cpu_time;
459 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
460
461 /* sys_sched_yield() stats */
462 unsigned int yld_count;
463
464 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count;
467 unsigned int sched_goidle;
468
469 /* try_to_wake_up() stats */
470 unsigned int ttwu_count;
471 unsigned int ttwu_local;
472#endif
473
474#ifdef CONFIG_SMP
475 struct llist_head wake_list;
476#endif
477};
478
479static inline int cpu_of(struct rq *rq)
480{
481#ifdef CONFIG_SMP
482 return rq->cpu;
483#else
484 return 0;
485#endif
486}
487
488DECLARE_PER_CPU(struct rq, runqueues);
489
490#define rcu_dereference_check_sched_domain(p) \
491 rcu_dereference_check((p), \
492 lockdep_is_held(&sched_domains_mutex))
493
494/*
495 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
496 * See detach_destroy_domains: synchronize_sched for details.
497 *
498 * The domain tree of any CPU may only be accessed from within
499 * preempt-disabled sections.
500 */
501#define for_each_domain(cpu, __sd) \
502 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
503
504#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
505
506#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
507#define this_rq() (&__get_cpu_var(runqueues))
508#define task_rq(p) cpu_rq(task_cpu(p))
509#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
510#define raw_rq() (&__raw_get_cpu_var(runqueues))
511
512#include "stats.h"
513#include "auto_group.h"
514
515#ifdef CONFIG_CGROUP_SCHED
516
517/*
518 * Return the group to which this tasks belongs.
519 *
520 * We use task_subsys_state_check() and extend the RCU verification with
521 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
522 * task it moves into the cgroup. Therefore by holding either of those locks,
523 * we pin the task to the current cgroup.
524 */
525static inline struct task_group *task_group(struct task_struct *p)
526{
527 struct task_group *tg;
528 struct cgroup_subsys_state *css;
529
530 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
531 lockdep_is_held(&p->pi_lock) ||
532 lockdep_is_held(&task_rq(p)->lock));
533 tg = container_of(css, struct task_group, css);
534
535 return autogroup_task_group(p, tg);
536}
537
538/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
539static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
540{
541#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
542 struct task_group *tg = task_group(p);
543#endif
544
545#ifdef CONFIG_FAIR_GROUP_SCHED
546 p->se.cfs_rq = tg->cfs_rq[cpu];
547 p->se.parent = tg->se[cpu];
548#endif
549
550#ifdef CONFIG_RT_GROUP_SCHED
551 p->rt.rt_rq = tg->rt_rq[cpu];
552 p->rt.parent = tg->rt_se[cpu];
553#endif
554}
555
556#else /* CONFIG_CGROUP_SCHED */
557
558static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
559static inline struct task_group *task_group(struct task_struct *p)
560{
561 return NULL;
562}
563
564#endif /* CONFIG_CGROUP_SCHED */
565
566static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
567{
568 set_task_rq(p, cpu);
569#ifdef CONFIG_SMP
570 /*
571 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
572 * successfuly executed on another CPU. We must ensure that updates of
573 * per-task data have been completed by this moment.
574 */
575 smp_wmb();
576 task_thread_info(p)->cpu = cpu;
577#endif
578}
579
580/*
581 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
582 */
583#ifdef CONFIG_SCHED_DEBUG
584# include <linux/jump_label.h>
585# define const_debug __read_mostly
586#else
587# define const_debug const
588#endif
589
590extern const_debug unsigned int sysctl_sched_features;
591
592#define SCHED_FEAT(name, enabled) \
593 __SCHED_FEAT_##name ,
594
595enum {
596#include "features.h"
597 __SCHED_FEAT_NR,
598};
599
600#undef SCHED_FEAT
601
602#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
603static __always_inline bool static_branch__true(struct jump_label_key *key)
604{
605 return likely(static_branch(key)); /* Not out of line branch. */
606}
607
608static __always_inline bool static_branch__false(struct jump_label_key *key)
609{
610 return unlikely(static_branch(key)); /* Out of line branch. */
611}
612
613#define SCHED_FEAT(name, enabled) \
614static __always_inline bool static_branch_##name(struct jump_label_key *key) \
615{ \
616 return static_branch__##enabled(key); \
617}
618
619#include "features.h"
620
621#undef SCHED_FEAT
622
623extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
624#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
625#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
626#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
627#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
628
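[editor's note] The SCHED_FEAT machinery above is an x-macro: features.h is included once to build an enum of bit indices and again to build per-feature helpers, so sched_feat(x) reduces to a single bit test (or a jump label). A self-contained userspace model of the same trick, with made-up macro names:

#include <stdio.h>

#define FEATURE_LIST(F)			\
	F(GENTLE_FAIR_SLEEPERS, 1)	\
	F(START_DEBIT, 1)		\
	F(HRTICK, 0)

/* pass 1: bit indices */
#define F_ENUM(name, enabled)	FEAT_##name,
enum { FEATURE_LIST(F_ENUM) FEAT_NR };
#undef F_ENUM

/* pass 2: default bitmask built from the 'enabled' column */
#define F_DEFAULT(name, enabled) ((enabled) << FEAT_##name) |
static const unsigned int sysctl_features = FEATURE_LIST(F_DEFAULT) 0;
#undef F_DEFAULT

#define sched_feat(x)	(sysctl_features & (1U << FEAT_##x))

int main(void)
{
	printf("START_DEBIT=%d HRTICK=%d\n",
	       !!sched_feat(START_DEBIT), !!sched_feat(HRTICK));
	return 0;
}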
629static inline u64 global_rt_period(void)
630{
631 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
632}
633
634static inline u64 global_rt_runtime(void)
635{
636 if (sysctl_sched_rt_runtime < 0)
637 return RUNTIME_INF;
638
639 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
640}
641
642
643
644static inline int task_current(struct rq *rq, struct task_struct *p)
645{
646 return rq->curr == p;
647}
648
649static inline int task_running(struct rq *rq, struct task_struct *p)
650{
651#ifdef CONFIG_SMP
652 return p->on_cpu;
653#else
654 return task_current(rq, p);
655#endif
656}
657
658
659#ifndef prepare_arch_switch
660# define prepare_arch_switch(next) do { } while (0)
661#endif
662#ifndef finish_arch_switch
663# define finish_arch_switch(prev) do { } while (0)
664#endif
665
666#ifndef __ARCH_WANT_UNLOCKED_CTXSW
667static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
668{
669#ifdef CONFIG_SMP
670 /*
671 * We can optimise this out completely for !SMP, because the
672 * SMP rebalancing from interrupt is the only thing that cares
673 * here.
674 */
675 next->on_cpu = 1;
676#endif
677}
678
679static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
680{
681#ifdef CONFIG_SMP
682 /*
683 * After ->on_cpu is cleared, the task can be moved to a different CPU.
684 * We must ensure this doesn't happen until the switch is completely
685 * finished.
686 */
687 smp_wmb();
688 prev->on_cpu = 0;
689#endif
690#ifdef CONFIG_DEBUG_SPINLOCK
691 /* this is a valid case when another task releases the spinlock */
692 rq->lock.owner = current;
693#endif
694 /*
695 * If we are tracking spinlock dependencies then we have to
696 * fix up the runqueue lock - which gets 'carried over' from
697 * prev into current:
698 */
699 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
700
701 raw_spin_unlock_irq(&rq->lock);
702}
703
704#else /* __ARCH_WANT_UNLOCKED_CTXSW */
705static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
706{
707#ifdef CONFIG_SMP
708 /*
709 * We can optimise this out completely for !SMP, because the
710 * SMP rebalancing from interrupt is the only thing that cares
711 * here.
712 */
713 next->on_cpu = 1;
714#endif
715#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
716 raw_spin_unlock_irq(&rq->lock);
717#else
718 raw_spin_unlock(&rq->lock);
719#endif
720}
721
722static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
723{
724#ifdef CONFIG_SMP
725 /*
726 * After ->on_cpu is cleared, the task can be moved to a different CPU.
727 * We must ensure this doesn't happen until the switch is completely
728 * finished.
729 */
730 smp_wmb();
731 prev->on_cpu = 0;
732#endif
733#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
734 local_irq_enable();
735#endif
736}
737#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
738
739
740static inline void update_load_add(struct load_weight *lw, unsigned long inc)
741{
742 lw->weight += inc;
743 lw->inv_weight = 0;
744}
745
746static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
747{
748 lw->weight -= dec;
749 lw->inv_weight = 0;
750}
751
752static inline void update_load_set(struct load_weight *lw, unsigned long w)
753{
754 lw->weight = w;
755 lw->inv_weight = 0;
756}
757
758/*
759 * To aid in avoiding the subversion of "niceness" due to uneven distribution
760 * of tasks with abnormal "nice" values across CPUs the contribution that
761 * each task makes to its run queue's load is weighted according to its
762 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
763 * scaled version of the new time slice allocation that they receive on time
764 * slice expiry etc.
765 */
766
767#define WEIGHT_IDLEPRIO 3
768#define WMULT_IDLEPRIO 1431655765
769
770/*
771 * Nice levels are multiplicative, with a gentle 10% change for every
772 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
773 * nice 1, it will get ~10% less CPU time than another CPU-bound task
774 * that remained on nice 0.
775 *
776 * The "10% effect" is relative and cumulative: from _any_ nice level,
777 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
778 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
779 * If a task goes up by ~10% and another task goes down by ~10% then
780 * the relative distance between them is ~25%.)
781 */
782static const int prio_to_weight[40] = {
783 /* -20 */ 88761, 71755, 56483, 46273, 36291,
784 /* -15 */ 29154, 23254, 18705, 14949, 11916,
785 /* -10 */ 9548, 7620, 6100, 4904, 3906,
786 /* -5 */ 3121, 2501, 1991, 1586, 1277,
787 /* 0 */ 1024, 820, 655, 526, 423,
788 /* 5 */ 335, 272, 215, 172, 137,
789 /* 10 */ 110, 87, 70, 56, 45,
790 /* 15 */ 36, 29, 23, 18, 15,
791};
792
793/*
794 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
795 *
796 * In cases where the weight does not change often, we can use the
797 * precalculated inverse to speed up arithmetics by turning divisions
798 * into multiplications:
799 */
800static const u32 prio_to_wmult[40] = {
801 /* -20 */ 48388, 59856, 76040, 92818, 118348,
802 /* -15 */ 147320, 184698, 229616, 287308, 360437,
803 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
804 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
805 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
806 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
807 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
808 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
809};
810
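[editor's note] A quick standalone check of the two properties the comments above describe: adjacent nice levels differ by roughly 1.25x in weight, and prio_to_wmult[i] is approximately 2^32 / prio_to_weight[i] (the four sample values are copied from the tables):

#include <stdio.h>

int main(void)
{
	/* nice -1, 0, 1, 2 */
	static const int weight[] = { 1277, 1024, 820, 655 };
	static const unsigned int wmult[] = { 3363326, 4194304, 5237765, 6557202 };
	int i;

	for (i = 0; i + 1 < 4; i++)
		printf("weight ratio %d/%d = %.3f\n",
		       weight[i], weight[i + 1],
		       (double)weight[i] / weight[i + 1]);

	for (i = 0; i < 4; i++)
		printf("2^32/%d = %.0f (table: %u)\n",
		       weight[i], 4294967296.0 / weight[i], wmult[i]);
	return 0;
}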
811/* Time spent by the tasks of the cpu accounting group executing in ... */
812enum cpuacct_stat_index {
813 CPUACCT_STAT_USER, /* ... user mode */
814 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
815
816 CPUACCT_STAT_NSTATS,
817};
818
819
820#define sched_class_highest (&stop_sched_class)
821#define for_each_class(class) \
822 for (class = sched_class_highest; class; class = class->next)
823
824extern const struct sched_class stop_sched_class;
825extern const struct sched_class rt_sched_class;
826extern const struct sched_class fair_sched_class;
827extern const struct sched_class idle_sched_class;
828
829
830#ifdef CONFIG_SMP
831
832extern void trigger_load_balance(struct rq *rq, int cpu);
833extern void idle_balance(int this_cpu, struct rq *this_rq);
834
835#else /* CONFIG_SMP */
836
837static inline void idle_balance(int cpu, struct rq *rq)
838{
839}
840
841#endif
842
843extern void sysrq_sched_debug_show(void);
844extern void sched_init_granularity(void);
845extern void update_max_interval(void);
846extern void update_group_power(struct sched_domain *sd, int cpu);
847extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
848extern void init_sched_rt_class(void);
849extern void init_sched_fair_class(void);
850
851extern void resched_task(struct task_struct *p);
852extern void resched_cpu(int cpu);
853
854extern struct rt_bandwidth def_rt_bandwidth;
855extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
856
857extern void update_cpu_load(struct rq *this_rq);
858
859#ifdef CONFIG_CGROUP_CPUACCT
860#include <linux/cgroup.h>
861/* track cpu usage of a group of tasks and its child groups */
862struct cpuacct {
863 struct cgroup_subsys_state css;
864 /* cpuusage holds pointer to a u64-type object on every cpu */
865 u64 __percpu *cpuusage;
866 struct kernel_cpustat __percpu *cpustat;
867};
868
869/* return cpu accounting group corresponding to this container */
870static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
871{
872 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
873 struct cpuacct, css);
874}
875
876/* return cpu accounting group to which this task belongs */
877static inline struct cpuacct *task_ca(struct task_struct *tsk)
878{
879 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
880 struct cpuacct, css);
881}
882
883static inline struct cpuacct *parent_ca(struct cpuacct *ca)
884{
885 if (!ca || !ca->css.cgroup->parent)
886 return NULL;
887 return cgroup_ca(ca->css.cgroup->parent);
888}
889
890extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
891#else
892static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
893#endif
894
895static inline void inc_nr_running(struct rq *rq)
896{
897 rq->nr_running++;
898}
899
900static inline void dec_nr_running(struct rq *rq)
901{
902 rq->nr_running--;
903}
904
905extern void update_rq_clock(struct rq *rq);
906
907extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
908extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
909
910extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
911
912extern const_debug unsigned int sysctl_sched_time_avg;
913extern const_debug unsigned int sysctl_sched_nr_migrate;
914extern const_debug unsigned int sysctl_sched_migration_cost;
915
916static inline u64 sched_avg_period(void)
917{
918 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
919}
920
921void calc_load_account_idle(struct rq *this_rq);
922
923#ifdef CONFIG_SCHED_HRTICK
924
925/*
926 * Use hrtick when:
927 * - enabled by features
928 * - hrtimer is actually high res
929 */
930static inline int hrtick_enabled(struct rq *rq)
931{
932 if (!sched_feat(HRTICK))
933 return 0;
934 if (!cpu_active(cpu_of(rq)))
935 return 0;
936 return hrtimer_is_hres_active(&rq->hrtick_timer);
937}
938
939void hrtick_start(struct rq *rq, u64 delay);
940
941#else
942
943static inline int hrtick_enabled(struct rq *rq)
944{
945 return 0;
946}
947
948#endif /* CONFIG_SCHED_HRTICK */
949
950#ifdef CONFIG_SMP
951extern void sched_avg_update(struct rq *rq);
952static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
953{
954 rq->rt_avg += rt_delta;
955 sched_avg_update(rq);
956}
957#else
958static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
959static inline void sched_avg_update(struct rq *rq) { }
960#endif
961
962extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
963
964#ifdef CONFIG_SMP
965#ifdef CONFIG_PREEMPT
966
967static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
968
969/*
970 * fair double_lock_balance: Safely acquires both rq->locks in a fair
971 * way at the expense of forcing extra atomic operations in all
972 * invocations. This assures that the double_lock is acquired using the
973 * same underlying policy as the spinlock_t on this architecture, which
974 * reduces latency compared to the unfair variant below. However, it
975 * also adds more overhead and therefore may reduce throughput.
976 */
977static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
978 __releases(this_rq->lock)
979 __acquires(busiest->lock)
980 __acquires(this_rq->lock)
981{
982 raw_spin_unlock(&this_rq->lock);
983 double_rq_lock(this_rq, busiest);
984
985 return 1;
986}
987
988#else
989/*
990 * Unfair double_lock_balance: Optimizes throughput at the expense of
991 * latency by eliminating extra atomic operations when the locks are
992 * already in proper order on entry. This favors lower cpu-ids and will
993 * grant the double lock to lower cpus over higher ids under contention,
994 * regardless of entry order into the function.
995 */
996static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
997 __releases(this_rq->lock)
998 __acquires(busiest->lock)
999 __acquires(this_rq->lock)
1000{
1001 int ret = 0;
1002
1003 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1004 if (busiest < this_rq) {
1005 raw_spin_unlock(&this_rq->lock);
1006 raw_spin_lock(&busiest->lock);
1007 raw_spin_lock_nested(&this_rq->lock,
1008 SINGLE_DEPTH_NESTING);
1009 ret = 1;
1010 } else
1011 raw_spin_lock_nested(&busiest->lock,
1012 SINGLE_DEPTH_NESTING);
1013 }
1014 return ret;
1015}
1016
1017#endif /* CONFIG_PREEMPT */
1018
1019/*
1020 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1021 */
1022static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1023{
1024 if (unlikely(!irqs_disabled())) {
1025 /* printk() doesn't work good under rq->lock */
1026 raw_spin_unlock(&this_rq->lock);
1027 BUG_ON(1);
1028 }
1029
1030 return _double_lock_balance(this_rq, busiest);
1031}
1032
1033static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1034 __releases(busiest->lock)
1035{
1036 raw_spin_unlock(&busiest->lock);
1037 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1038}
1039
1040/*
1041 * double_rq_lock - safely lock two runqueues
1042 *
1043 * Note this does not disable interrupts like task_rq_lock,
1044 * you need to do so manually before calling.
1045 */
1046static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1047 __acquires(rq1->lock)
1048 __acquires(rq2->lock)
1049{
1050 BUG_ON(!irqs_disabled());
1051 if (rq1 == rq2) {
1052 raw_spin_lock(&rq1->lock);
1053 __acquire(rq2->lock); /* Fake it out ;) */
1054 } else {
1055 if (rq1 < rq2) {
1056 raw_spin_lock(&rq1->lock);
1057 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1058 } else {
1059 raw_spin_lock(&rq2->lock);
1060 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1061 }
1062 }
1063}
1064
1065/*
1066 * double_rq_unlock - safely unlock two runqueues
1067 *
1068 * Note this does not restore interrupts like task_rq_unlock,
1069 * you need to do so manually after calling.
1070 */
1071static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1072 __releases(rq1->lock)
1073 __releases(rq2->lock)
1074{
1075 raw_spin_unlock(&rq1->lock);
1076 if (rq1 != rq2)
1077 raw_spin_unlock(&rq2->lock);
1078 else
1079 __release(rq2->lock);
1080}
1081
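[editor's note] double_rq_lock() and the unfair _double_lock_balance() above avoid ABBA deadlock by always taking the lower-addressed runqueue lock first. A userspace model of that address-ordering rule using pthread mutexes (an analogy only, not runqueue locks):

#include <pthread.h>
#include <stdio.h>

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	if (a > b) {			/* order by address, as rq1 < rq2 does */
		pthread_mutex_t *tmp = a; a = b; b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	double_lock(&m1, &m2);
	printf("both locks held in a deadlock-free order\n");
	double_unlock(&m1, &m2);
	return 0;
}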
1082#else /* CONFIG_SMP */
1083
1084/*
1085 * double_rq_lock - safely lock two runqueues
1086 *
1087 * Note this does not disable interrupts like task_rq_lock,
1088 * you need to do so manually before calling.
1089 */
1090static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1091 __acquires(rq1->lock)
1092 __acquires(rq2->lock)
1093{
1094 BUG_ON(!irqs_disabled());
1095 BUG_ON(rq1 != rq2);
1096 raw_spin_lock(&rq1->lock);
1097 __acquire(rq2->lock); /* Fake it out ;) */
1098}
1099
1100/*
1101 * double_rq_unlock - safely unlock two runqueues
1102 *
1103 * Note this does not restore interrupts like task_rq_unlock;
1104 * you need to do so manually after calling.
1105 */
1106static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1107 __releases(rq1->lock)
1108 __releases(rq2->lock)
1109{
1110 BUG_ON(rq1 != rq2);
1111 raw_spin_unlock(&rq1->lock);
1112 __release(rq2->lock);
1113}
1114
1115#endif
1116
1117extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1118extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1119extern void print_cfs_stats(struct seq_file *m, int cpu);
1120extern void print_rt_stats(struct seq_file *m, int cpu);
1121
1122extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1123extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1124extern void unthrottle_offline_cfs_rqs(struct rq *rq);
1125
1126extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1127
1128#ifdef CONFIG_NO_HZ
1129enum rq_nohz_flag_bits {
1130 NOHZ_TICK_STOPPED,
1131 NOHZ_BALANCE_KICK,
1132 NOHZ_IDLE,
1133};
1134
1135#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1136#endif
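The rq_nohz_flag_bits values are bit numbers within the per-runqueue nohz_flags word referenced by the nohz_flags() macro, so remote CPUs can test and set them with atomic bit operations without taking the runqueue lock. A loose userspace analogue using C11 atomics (the 64-entry array and the demo_* names are invented for the example):

#include <stdatomic.h>

enum demo_nohz_bits { DEMO_TICK_STOPPED, DEMO_BALANCE_KICK, DEMO_IDLE };

static _Atomic unsigned long demo_nohz_flags[64];      /* one word per "cpu" */

/* Returns the previous value of the bit, like test_and_set_bit(). */
static int demo_test_and_set(int cpu, int bit)
{
        unsigned long mask = 1UL << bit;

        return (atomic_fetch_or(&demo_nohz_flags[cpu], mask) & mask) != 0;
}

static void demo_clear_bit(int cpu, int bit)
{
        atomic_fetch_and(&demo_nohz_flags[cpu], ~(1UL << bit));
}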
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 00000000000..2a581ba8e19
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,111 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
14
15static int show_schedstat(struct seq_file *seq, void *v)
16{
17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
25 seq_printf(seq, "timestamp %lu\n", jiffies);
26 for_each_online_cpu(cpu) {
27 struct rq *rq = cpu_rq(cpu);
28#ifdef CONFIG_SMP
29 struct sched_domain *sd;
30 int dcount = 0;
31#endif
32
33 /* runqueue-specific stats */
34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
41
42 seq_printf(seq, "\n");
43
44#ifdef CONFIG_SMP
45 /* domain-specific stats */
46 rcu_read_lock();
47 for_each_domain(cpu, sd) {
48 enum cpu_idle_type itype;
49
50 cpumask_scnprintf(mask_str, mask_len,
51 sched_domain_span(sd));
52 seq_printf(seq, "domain%d %s", dcount++, mask_str);
53 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
54 itype++) {
55 seq_printf(seq, " %u %u %u %u %u %u %u %u",
56 sd->lb_count[itype],
57 sd->lb_balanced[itype],
58 sd->lb_failed[itype],
59 sd->lb_imbalance[itype],
60 sd->lb_gained[itype],
61 sd->lb_hot_gained[itype],
62 sd->lb_nobusyq[itype],
63 sd->lb_nobusyg[itype]);
64 }
65 seq_printf(seq,
66 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
67 sd->alb_count, sd->alb_failed, sd->alb_pushed,
68 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
69 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
70 sd->ttwu_wake_remote, sd->ttwu_move_affine,
71 sd->ttwu_move_balance);
72 }
73 rcu_read_unlock();
74#endif
75 }
76 kfree(mask_str);
77 return 0;
78}
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
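schedstat_open() follows the standard single_open() pattern but pre-sizes the seq_file buffer (one page per 32 online CPUs) so a single read can emit every CPU's lines without the seq_file core having to reallocate and rerun show_schedstat(). For comparison, a minimal /proc file that is content with the default one-page buffer would look roughly like this hedged sketch (the demo_stat name and demo_show() contents are made up):

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
        seq_printf(m, "hello from demo_stat\n");
        return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
        return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
        .open           = demo_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static int __init demo_init(void)
{
        proc_create("demo_stat", 0, NULL, &demo_fops);
        return 0;
}

static void __exit demo_exit(void)
{
        remove_proc_entry("demo_stat", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");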
98
99static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open,
101 .read = seq_read,
102 .llseek = seq_lseek,
103 .release = single_release,
104};
105
106static int __init proc_schedstat_init(void)
107{
108 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
109 return 0;
110}
111module_init(proc_schedstat_init);
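Because the /proc/schedstat format is consumed by userspace tools, SCHEDSTAT_VERSION is the only signal they get when field meanings change. A small, hedged example of a reader that checks the version line before trusting the layout, then prints just the first per-CPU counter (yld_count):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[4096];
        int version = 0;
        FILE *f = fopen("/proc/schedstat", "r");

        if (!f) {
                perror("/proc/schedstat");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (sscanf(line, "version %d", &version) == 1) {
                        if (version != 15)
                                fprintf(stderr, "unknown schedstat version %d\n",
                                        version);
                } else if (strncmp(line, "cpu", 3) == 0) {
                        int cpu;
                        unsigned int yld;

                        if (sscanf(line, "cpu%d %u", &cpu, &yld) == 2)
                                printf("cpu%d yld_count=%u\n", cpu, yld);
                }
        }
        fclose(f);
        return 0;
}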
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h
index 4b71dbef271..2ef90a51ec5 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched/stats.h
@@ -1,108 +1,5 @@
1 1
2#ifdef CONFIG_SCHEDSTATS 2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 15
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
17
18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
19 seq_printf(seq, "timestamp %lu\n", jiffies);
20 for_each_online_cpu(cpu) {
21 struct rq *rq = cpu_rq(cpu);
22#ifdef CONFIG_SMP
23 struct sched_domain *sd;
24 int dcount = 0;
25#endif
26
27 /* runqueue-specific stats */
28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_count,
31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
32 rq->ttwu_count, rq->ttwu_local,
33 rq->rq_cpu_time,
34 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
35
36 seq_printf(seq, "\n");
37
38#ifdef CONFIG_SMP
39 /* domain-specific stats */
40 rcu_read_lock();
41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype;
43
44 cpumask_scnprintf(mask_str, mask_len,
45 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) {
49 seq_printf(seq, " %u %u %u %u %u %u %u %u",
50 sd->lb_count[itype],
51 sd->lb_balanced[itype],
52 sd->lb_failed[itype],
53 sd->lb_imbalance[itype],
54 sd->lb_gained[itype],
55 sd->lb_hot_gained[itype],
56 sd->lb_nobusyq[itype],
57 sd->lb_nobusyg[itype]);
58 }
59 seq_printf(seq,
60 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
61 sd->alb_count, sd->alb_failed, sd->alb_pushed,
62 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
63 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance);
66 }
67 rcu_read_unlock();
68#endif
69 }
70 kfree(mask_str);
71 return 0;
72}
73
74static int schedstat_open(struct inode *inode, struct file *file)
75{
76 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
77 char *buf = kmalloc(size, GFP_KERNEL);
78 struct seq_file *m;
79 int res;
80
81 if (!buf)
82 return -ENOMEM;
83 res = single_open(file, show_schedstat, NULL);
84 if (!res) {
85 m = file->private_data;
86 m->buf = buf;
87 m->size = size;
88 } else
89 kfree(buf);
90 return res;
91}
92
93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106 3
107/* 4/*
108 * Expects runqueue lock to be held for atomicity of update 5 * Expects runqueue lock to be held for atomicity of update
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c
index 8b44e7fa7fb..7b386e86fd2 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * stop-task scheduling class. 4 * stop-task scheduling class.
3 * 5 *
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
80/* 82/*
81 * Simple, special scheduling class for the per-CPU stop tasks: 83 * Simple, special scheduling class for the per-CPU stop tasks:
82 */ 84 */
83static const struct sched_class stop_sched_class = { 85const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class, 86 .next = &rt_sched_class,
85 87
86 .enqueue_task = enqueue_task_stop, 88 .enqueue_task = enqueue_task_stop,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 40420644d0b..31cc06163ed 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -297,6 +297,15 @@ void tick_nohz_stop_sched_tick(int inidle)
297 ts = &per_cpu(tick_cpu_sched, cpu); 297 ts = &per_cpu(tick_cpu_sched, cpu);
298 298
299 /* 299 /*
300 * Update the idle state in the scheduler domain hierarchy
301 * when tick_nohz_stop_sched_tick() is called from the idle loop.
302 * State will be updated to busy during the first busy tick after
303 * exiting idle.
304 */
305 if (inidle)
306 set_cpu_sd_state_idle();
307
308 /*
300 * Call to tick_nohz_start_idle stops the last_update_time from being 309 * Call to tick_nohz_start_idle stops the last_update_time from being
301 * updated. Thus, it must not be called in the event we are called from 310 * updated. Thus, it must not be called in the event we are called from
302 * irq_exit() with the prior state different than idle. 311 * irq_exit() with the prior state different than idle.