diff options
| -rw-r--r-- | arch/s390/appldata/appldata_os.c | 16 | ||||
| -rw-r--r-- | arch/x86/include/asm/i387.h | 2 | ||||
| -rw-r--r-- | drivers/cpufreq/cpufreq_conservative.c | 41 | ||||
| -rw-r--r-- | drivers/cpufreq/cpufreq_ondemand.c | 41 | ||||
| -rw-r--r-- | drivers/macintosh/rack-meter.c | 7 | ||||
| -rw-r--r-- | fs/proc/stat.c | 52 | ||||
| -rw-r--r-- | fs/proc/uptime.c | 4 | ||||
| -rw-r--r-- | include/linux/kernel_stat.h | 36 | ||||
| -rw-r--r-- | include/linux/latencytop.h | 3 | ||||
| -rw-r--r-- | include/linux/sched.h | 19 | ||||
| -rw-r--r-- | include/trace/events/sched.h | 7 | ||||
| -rw-r--r-- | kernel/Makefile | 20 | ||||
| -rw-r--r-- | kernel/sched/Makefile | 20 | ||||
| -rw-r--r-- | kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c) | 33 | ||||
| -rw-r--r-- | kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h) | 26 | ||||
| -rw-r--r-- | kernel/sched/clock.c (renamed from kernel/sched_clock.c) | 0 | ||||
| -rw-r--r-- | kernel/sched/core.c (renamed from kernel/sched.c) | 2098 | ||||
| -rw-r--r-- | kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c) | 4 | ||||
| -rw-r--r-- | kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h) | 0 | ||||
| -rw-r--r-- | kernel/sched/debug.c (renamed from kernel/sched_debug.c) | 6 | ||||
| -rw-r--r-- | kernel/sched/fair.c (renamed from kernel/sched_fair.c) | 929 | ||||
| -rw-r--r-- | kernel/sched/features.h (renamed from kernel/sched_features.h) | 30 | ||||
| -rw-r--r-- | kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c) | 4 | ||||
| -rw-r--r-- | kernel/sched/rt.c (renamed from kernel/sched_rt.c) | 218 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 1136 | ||||
| -rw-r--r-- | kernel/sched/stats.c | 111 | ||||
| -rw-r--r-- | kernel/sched/stats.h (renamed from kernel/sched_stats.h) | 103 | ||||
| -rw-r--r-- | kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c) | 4 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 9 |
29 files changed, 2606 insertions, 2373 deletions
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 92f1cb745d69..4de031d6b76c 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c | |||
| @@ -115,21 +115,21 @@ static void appldata_get_os_data(void *data) | |||
| 115 | j = 0; | 115 | j = 0; |
| 116 | for_each_online_cpu(i) { | 116 | for_each_online_cpu(i) { |
| 117 | os_data->os_cpu[j].per_cpu_user = | 117 | os_data->os_cpu[j].per_cpu_user = |
| 118 | cputime_to_jiffies(kstat_cpu(i).cpustat.user); | 118 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]); |
| 119 | os_data->os_cpu[j].per_cpu_nice = | 119 | os_data->os_cpu[j].per_cpu_nice = |
| 120 | cputime_to_jiffies(kstat_cpu(i).cpustat.nice); | 120 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]); |
| 121 | os_data->os_cpu[j].per_cpu_system = | 121 | os_data->os_cpu[j].per_cpu_system = |
| 122 | cputime_to_jiffies(kstat_cpu(i).cpustat.system); | 122 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]); |
| 123 | os_data->os_cpu[j].per_cpu_idle = | 123 | os_data->os_cpu[j].per_cpu_idle = |
| 124 | cputime_to_jiffies(kstat_cpu(i).cpustat.idle); | 124 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]); |
| 125 | os_data->os_cpu[j].per_cpu_irq = | 125 | os_data->os_cpu[j].per_cpu_irq = |
| 126 | cputime_to_jiffies(kstat_cpu(i).cpustat.irq); | 126 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]); |
| 127 | os_data->os_cpu[j].per_cpu_softirq = | 127 | os_data->os_cpu[j].per_cpu_softirq = |
| 128 | cputime_to_jiffies(kstat_cpu(i).cpustat.softirq); | 128 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]); |
| 129 | os_data->os_cpu[j].per_cpu_iowait = | 129 | os_data->os_cpu[j].per_cpu_iowait = |
| 130 | cputime_to_jiffies(kstat_cpu(i).cpustat.iowait); | 130 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]); |
| 131 | os_data->os_cpu[j].per_cpu_steal = | 131 | os_data->os_cpu[j].per_cpu_steal = |
| 132 | cputime_to_jiffies(kstat_cpu(i).cpustat.steal); | 132 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]); |
| 133 | os_data->os_cpu[j].cpu_id = i; | 133 | os_data->os_cpu[j].cpu_id = i; |
| 134 | j++; | 134 | j++; |
| 135 | } | 135 | } |
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c9e09ea05644..6919e936345b 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h | |||
| @@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu) | |||
| 218 | #ifdef CONFIG_SMP | 218 | #ifdef CONFIG_SMP |
| 219 | #define safe_address (__per_cpu_offset[0]) | 219 | #define safe_address (__per_cpu_offset[0]) |
| 220 | #else | 220 | #else |
| 221 | #define safe_address (kstat_cpu(0).cpustat.user) | 221 | #define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER]) |
| 222 | #endif | 222 | #endif |
| 223 | 223 | ||
| 224 | /* | 224 | /* |
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 7f31a031c0b5..235a340e81f2 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c | |||
| @@ -95,26 +95,26 @@ static struct dbs_tuners { | |||
| 95 | .freq_step = 5, | 95 | .freq_step = 5, |
| 96 | }; | 96 | }; |
| 97 | 97 | ||
| 98 | static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, | 98 | static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) |
| 99 | cputime64_t *wall) | ||
| 100 | { | 99 | { |
| 101 | cputime64_t idle_time; | 100 | u64 idle_time; |
| 102 | cputime64_t cur_wall_time; | 101 | u64 cur_wall_time; |
| 103 | cputime64_t busy_time; | 102 | u64 busy_time; |
| 104 | 103 | ||
| 105 | cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); | 104 | cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); |
| 106 | busy_time = kstat_cpu(cpu).cpustat.user; | 105 | |
| 107 | busy_time += kstat_cpu(cpu).cpustat.system; | 106 | busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; |
| 108 | busy_time += kstat_cpu(cpu).cpustat.irq; | 107 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; |
| 109 | busy_time += kstat_cpu(cpu).cpustat.softirq; | 108 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ]; |
| 110 | busy_time += kstat_cpu(cpu).cpustat.steal; | 109 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ]; |
| 111 | busy_time += kstat_cpu(cpu).cpustat.nice; | 110 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; |
| 111 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; | ||
| 112 | 112 | ||
| 113 | idle_time = cur_wall_time - busy_time; | 113 | idle_time = cur_wall_time - busy_time; |
| 114 | if (wall) | 114 | if (wall) |
| 115 | *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time); | 115 | *wall = jiffies_to_usecs(cur_wall_time); |
| 116 | 116 | ||
| 117 | return (cputime64_t)jiffies_to_usecs(idle_time); | 117 | return jiffies_to_usecs(idle_time); |
| 118 | } | 118 | } |
| 119 | 119 | ||
| 120 | static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) | 120 | static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) |
| @@ -271,7 +271,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, | |||
| 271 | dbs_info->prev_cpu_idle = get_cpu_idle_time(j, | 271 | dbs_info->prev_cpu_idle = get_cpu_idle_time(j, |
| 272 | &dbs_info->prev_cpu_wall); | 272 | &dbs_info->prev_cpu_wall); |
| 273 | if (dbs_tuners_ins.ignore_nice) | 273 | if (dbs_tuners_ins.ignore_nice) |
| 274 | dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; | 274 | dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; |
| 275 | } | 275 | } |
| 276 | return count; | 276 | return count; |
| 277 | } | 277 | } |
| @@ -361,11 +361,11 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) | |||
| 361 | j_dbs_info->prev_cpu_idle = cur_idle_time; | 361 | j_dbs_info->prev_cpu_idle = cur_idle_time; |
| 362 | 362 | ||
| 363 | if (dbs_tuners_ins.ignore_nice) { | 363 | if (dbs_tuners_ins.ignore_nice) { |
| 364 | cputime64_t cur_nice; | 364 | u64 cur_nice; |
| 365 | unsigned long cur_nice_jiffies; | 365 | unsigned long cur_nice_jiffies; |
| 366 | 366 | ||
| 367 | cur_nice = kstat_cpu(j).cpustat.nice - | 367 | cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] - |
| 368 | j_dbs_info->prev_cpu_nice; | 368 | j_dbs_info->prev_cpu_nice; |
| 369 | /* | 369 | /* |
| 370 | * Assumption: nice time between sampling periods will | 370 | * Assumption: nice time between sampling periods will |
| 371 | * be less than 2^32 jiffies for 32 bit sys | 371 | * be less than 2^32 jiffies for 32 bit sys |
| @@ -373,7 +373,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) | |||
| 373 | cur_nice_jiffies = (unsigned long) | 373 | cur_nice_jiffies = (unsigned long) |
| 374 | cputime64_to_jiffies64(cur_nice); | 374 | cputime64_to_jiffies64(cur_nice); |
| 375 | 375 | ||
| 376 | j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; | 376 | j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; |
| 377 | idle_time += jiffies_to_usecs(cur_nice_jiffies); | 377 | idle_time += jiffies_to_usecs(cur_nice_jiffies); |
| 378 | } | 378 | } |
| 379 | 379 | ||
| @@ -500,10 +500,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, | |||
| 500 | 500 | ||
| 501 | j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, | 501 | j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, |
| 502 | &j_dbs_info->prev_cpu_wall); | 502 | &j_dbs_info->prev_cpu_wall); |
| 503 | if (dbs_tuners_ins.ignore_nice) { | 503 | if (dbs_tuners_ins.ignore_nice) |
| 504 | j_dbs_info->prev_cpu_nice = | 504 | j_dbs_info->prev_cpu_nice = |
| 505 | kstat_cpu(j).cpustat.nice; | 505 | kcpustat_cpu(j).cpustat[CPUTIME_NICE]; |
| 506 | } | ||
| 507 | } | 506 | } |
| 508 | this_dbs_info->down_skip = 0; | 507 | this_dbs_info->down_skip = 0; |
| 509 | this_dbs_info->requested_freq = policy->cur; | 508 | this_dbs_info->requested_freq = policy->cur; |
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 07cffe2f6cff..3d679eee70a1 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c | |||
| @@ -119,26 +119,26 @@ static struct dbs_tuners { | |||
| 119 | .powersave_bias = 0, | 119 | .powersave_bias = 0, |
| 120 | }; | 120 | }; |
| 121 | 121 | ||
| 122 | static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, | 122 | static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) |
| 123 | cputime64_t *wall) | ||
| 124 | { | 123 | { |
| 125 | cputime64_t idle_time; | 124 | u64 idle_time; |
| 126 | cputime64_t cur_wall_time; | 125 | u64 cur_wall_time; |
| 127 | cputime64_t busy_time; | 126 | u64 busy_time; |
| 128 | 127 | ||
| 129 | cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); | 128 | cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); |
| 130 | busy_time = kstat_cpu(cpu).cpustat.user; | 129 | |
| 131 | busy_time += kstat_cpu(cpu).cpustat.system; | 130 | busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; |
| 132 | busy_time += kstat_cpu(cpu).cpustat.irq; | 131 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; |
| 133 | busy_time += kstat_cpu(cpu).cpustat.softirq; | 132 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ]; |
| 134 | busy_time += kstat_cpu(cpu).cpustat.steal; | 133 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ]; |
| 135 | busy_time += kstat_cpu(cpu).cpustat.nice; | 134 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; |
| 135 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; | ||
| 136 | 136 | ||
| 137 | idle_time = cur_wall_time - busy_time; | 137 | idle_time = cur_wall_time - busy_time; |
| 138 | if (wall) | 138 | if (wall) |
| 139 | *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time); | 139 | *wall = jiffies_to_usecs(cur_wall_time); |
| 140 | 140 | ||
| 141 | return (cputime64_t)jiffies_to_usecs(idle_time); | 141 | return jiffies_to_usecs(idle_time); |
| 142 | } | 142 | } |
| 143 | 143 | ||
| 144 | static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) | 144 | static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) |
| @@ -344,7 +344,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, | |||
| 344 | dbs_info->prev_cpu_idle = get_cpu_idle_time(j, | 344 | dbs_info->prev_cpu_idle = get_cpu_idle_time(j, |
| 345 | &dbs_info->prev_cpu_wall); | 345 | &dbs_info->prev_cpu_wall); |
| 346 | if (dbs_tuners_ins.ignore_nice) | 346 | if (dbs_tuners_ins.ignore_nice) |
| 347 | dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; | 347 | dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; |
| 348 | 348 | ||
| 349 | } | 349 | } |
| 350 | return count; | 350 | return count; |
| @@ -454,11 +454,11 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) | |||
| 454 | j_dbs_info->prev_cpu_iowait = cur_iowait_time; | 454 | j_dbs_info->prev_cpu_iowait = cur_iowait_time; |
| 455 | 455 | ||
| 456 | if (dbs_tuners_ins.ignore_nice) { | 456 | if (dbs_tuners_ins.ignore_nice) { |
| 457 | cputime64_t cur_nice; | 457 | u64 cur_nice; |
| 458 | unsigned long cur_nice_jiffies; | 458 | unsigned long cur_nice_jiffies; |
| 459 | 459 | ||
| 460 | cur_nice = kstat_cpu(j).cpustat.nice - | 460 | cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] - |
| 461 | j_dbs_info->prev_cpu_nice; | 461 | j_dbs_info->prev_cpu_nice; |
| 462 | /* | 462 | /* |
| 463 | * Assumption: nice time between sampling periods will | 463 | * Assumption: nice time between sampling periods will |
| 464 | * be less than 2^32 jiffies for 32 bit sys | 464 | * be less than 2^32 jiffies for 32 bit sys |
| @@ -466,7 +466,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) | |||
| 466 | cur_nice_jiffies = (unsigned long) | 466 | cur_nice_jiffies = (unsigned long) |
| 467 | cputime64_to_jiffies64(cur_nice); | 467 | cputime64_to_jiffies64(cur_nice); |
| 468 | 468 | ||
| 469 | j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; | 469 | j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; |
| 470 | idle_time += jiffies_to_usecs(cur_nice_jiffies); | 470 | idle_time += jiffies_to_usecs(cur_nice_jiffies); |
| 471 | } | 471 | } |
| 472 | 472 | ||
| @@ -645,10 +645,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, | |||
| 645 | 645 | ||
| 646 | j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, | 646 | j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, |
| 647 | &j_dbs_info->prev_cpu_wall); | 647 | &j_dbs_info->prev_cpu_wall); |
| 648 | if (dbs_tuners_ins.ignore_nice) { | 648 | if (dbs_tuners_ins.ignore_nice) |
| 649 | j_dbs_info->prev_cpu_nice = | 649 | j_dbs_info->prev_cpu_nice = |
| 650 | kstat_cpu(j).cpustat.nice; | 650 | kcpustat_cpu(j).cpustat[CPUTIME_NICE]; |
| 651 | } | ||
| 652 | } | 651 | } |
| 653 | this_dbs_info->cpu = cpu; | 652 | this_dbs_info->cpu = cpu; |
| 654 | this_dbs_info->rate_mult = 1; | 653 | this_dbs_info->rate_mult = 1; |
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c index 909908ebf164..6dc26b61219b 100644 --- a/drivers/macintosh/rack-meter.c +++ b/drivers/macintosh/rack-meter.c | |||
| @@ -81,12 +81,13 @@ static int rackmeter_ignore_nice; | |||
| 81 | */ | 81 | */ |
| 82 | static inline cputime64_t get_cpu_idle_time(unsigned int cpu) | 82 | static inline cputime64_t get_cpu_idle_time(unsigned int cpu) |
| 83 | { | 83 | { |
| 84 | cputime64_t retval; | 84 | u64 retval; |
| 85 | 85 | ||
| 86 | retval = kstat_cpu(cpu).cpustat.idle + kstat_cpu(cpu).cpustat.iowait; | 86 | retval = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE] + |
| 87 | kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; | ||
| 87 | 88 | ||
| 88 | if (rackmeter_ignore_nice) | 89 | if (rackmeter_ignore_nice) |
| 89 | retval += kstat_cpu(cpu).cpustat.nice; | 90 | retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; |
| 90 | 91 | ||
| 91 | return retval; | 92 | return retval; |
| 92 | } | 93 | } |
diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 714d5d131e76..2527a68057fc 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c | |||
| @@ -22,14 +22,13 @@ | |||
| 22 | #define arch_idle_time(cpu) 0 | 22 | #define arch_idle_time(cpu) 0 |
| 23 | #endif | 23 | #endif |
| 24 | 24 | ||
| 25 | static cputime64_t get_idle_time(int cpu) | 25 | static u64 get_idle_time(int cpu) |
| 26 | { | 26 | { |
| 27 | u64 idle_time = get_cpu_idle_time_us(cpu, NULL); | 27 | u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL); |
| 28 | cputime64_t idle; | ||
| 29 | 28 | ||
| 30 | if (idle_time == -1ULL) { | 29 | if (idle_time == -1ULL) { |
| 31 | /* !NO_HZ so we can rely on cpustat.idle */ | 30 | /* !NO_HZ so we can rely on cpustat.idle */ |
| 32 | idle = kstat_cpu(cpu).cpustat.idle; | 31 | idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; |
| 33 | idle += arch_idle_time(cpu); | 32 | idle += arch_idle_time(cpu); |
| 34 | } else | 33 | } else |
| 35 | idle = nsecs_to_jiffies64(1000 * idle_time); | 34 | idle = nsecs_to_jiffies64(1000 * idle_time); |
| @@ -37,14 +36,13 @@ static cputime64_t get_idle_time(int cpu) | |||
| 37 | return idle; | 36 | return idle; |
| 38 | } | 37 | } |
| 39 | 38 | ||
| 40 | static cputime64_t get_iowait_time(int cpu) | 39 | static u64 get_iowait_time(int cpu) |
| 41 | { | 40 | { |
| 42 | u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); | 41 | u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL); |
| 43 | cputime64_t iowait; | ||
| 44 | 42 | ||
| 45 | if (iowait_time == -1ULL) | 43 | if (iowait_time == -1ULL) |
| 46 | /* !NO_HZ so we can rely on cpustat.iowait */ | 44 | /* !NO_HZ so we can rely on cpustat.iowait */ |
| 47 | iowait = kstat_cpu(cpu).cpustat.iowait; | 45 | iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; |
| 48 | else | 46 | else |
| 49 | iowait = nsecs_to_jiffies64(1000 * iowait_time); | 47 | iowait = nsecs_to_jiffies64(1000 * iowait_time); |
| 50 | 48 | ||
| @@ -55,8 +53,8 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 55 | { | 53 | { |
| 56 | int i, j; | 54 | int i, j; |
| 57 | unsigned long jif; | 55 | unsigned long jif; |
| 58 | cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; | 56 | u64 user, nice, system, idle, iowait, irq, softirq, steal; |
| 59 | cputime64_t guest, guest_nice; | 57 | u64 guest, guest_nice; |
| 60 | u64 sum = 0; | 58 | u64 sum = 0; |
| 61 | u64 sum_softirq = 0; | 59 | u64 sum_softirq = 0; |
| 62 | unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; | 60 | unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; |
| @@ -69,18 +67,16 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 69 | jif = boottime.tv_sec; | 67 | jif = boottime.tv_sec; |
| 70 | 68 | ||
| 71 | for_each_possible_cpu(i) { | 69 | for_each_possible_cpu(i) { |
| 72 | user += kstat_cpu(i).cpustat.user; | 70 | user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; |
| 73 | nice += kstat_cpu(i).cpustat.nice; | 71 | nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; |
| 74 | system += kstat_cpu(i).cpustat.system; | 72 | system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; |
| 75 | idle += get_idle_time(i); | 73 | idle += get_idle_time(i); |
| 76 | iowait += get_iowait_time(i); | 74 | iowait += get_iowait_time(i); |
| 77 | irq += kstat_cpu(i).cpustat.irq; | 75 | irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; |
| 78 | softirq += kstat_cpu(i).cpustat.softirq; | 76 | softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; |
| 79 | steal += kstat_cpu(i).cpustat.steal; | 77 | steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; |
| 80 | guest += kstat_cpu(i).cpustat.guest; | 78 | guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; |
| 81 | guest_nice += kstat_cpu(i).cpustat.guest_nice; | 79 | guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; |
| 82 | sum += kstat_cpu_irqs_sum(i); | ||
| 83 | sum += arch_irq_stat_cpu(i); | ||
| 84 | 80 | ||
| 85 | for (j = 0; j < NR_SOFTIRQS; j++) { | 81 | for (j = 0; j < NR_SOFTIRQS; j++) { |
| 86 | unsigned int softirq_stat = kstat_softirqs_cpu(j, i); | 82 | unsigned int softirq_stat = kstat_softirqs_cpu(j, i); |
| @@ -105,16 +101,16 @@ static int show_stat(struct seq_file *p, void *v) | |||
| 105 | (unsigned long long)cputime64_to_clock_t(guest_nice)); | 101 | (unsigned long long)cputime64_to_clock_t(guest_nice)); |
| 106 | for_each_online_cpu(i) { | 102 | for_each_online_cpu(i) { |
| 107 | /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ | 103 | /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ |
| 108 | user = kstat_cpu(i).cpustat.user; | 104 | user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; |
| 109 | nice = kstat_cpu(i).cpustat.nice; | 105 | nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; |
| 110 | system = kstat_cpu(i).cpustat.system; | 106 | system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; |
| 111 | idle = get_idle_time(i); | 107 | idle = get_idle_time(i); |
| 112 | iowait = get_iowait_time(i); | 108 | iowait = get_iowait_time(i); |
| 113 | irq = kstat_cpu(i).cpustat.irq; | 109 | irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; |
| 114 | softirq = kstat_cpu(i).cpustat.softirq; | 110 | softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; |
| 115 | steal = kstat_cpu(i).cpustat.steal; | 111 | steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; |
| 116 | guest = kstat_cpu(i).cpustat.guest; | 112 | guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; |
| 117 | guest_nice = kstat_cpu(i).cpustat.guest_nice; | 113 | guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; |
| 118 | seq_printf(p, | 114 | seq_printf(p, |
| 119 | "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " | 115 | "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " |
| 120 | "%llu\n", | 116 | "%llu\n", |
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index ab515109fec9..9610ac772d7e 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c | |||
| @@ -11,14 +11,14 @@ static int uptime_proc_show(struct seq_file *m, void *v) | |||
| 11 | { | 11 | { |
| 12 | struct timespec uptime; | 12 | struct timespec uptime; |
| 13 | struct timespec idle; | 13 | struct timespec idle; |
| 14 | cputime64_t idletime; | 14 | u64 idletime; |
| 15 | u64 nsec; | 15 | u64 nsec; |
| 16 | u32 rem; | 16 | u32 rem; |
| 17 | int i; | 17 | int i; |
| 18 | 18 | ||
| 19 | idletime = 0; | 19 | idletime = 0; |
| 20 | for_each_possible_cpu(i) | 20 | for_each_possible_cpu(i) |
| 21 | idletime += kstat_cpu(i).cpustat.idle; | 21 | idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; |
| 22 | 22 | ||
| 23 | do_posix_clock_monotonic_gettime(&uptime); | 23 | do_posix_clock_monotonic_gettime(&uptime); |
| 24 | monotonic_to_bootbased(&uptime); | 24 | monotonic_to_bootbased(&uptime); |
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 0cce2db580c3..2fbd9053c2df 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/percpu.h> | 6 | #include <linux/percpu.h> |
| 7 | #include <linux/cpumask.h> | 7 | #include <linux/cpumask.h> |
| 8 | #include <linux/interrupt.h> | 8 | #include <linux/interrupt.h> |
| 9 | #include <linux/sched.h> | ||
| 9 | #include <asm/irq.h> | 10 | #include <asm/irq.h> |
| 10 | #include <asm/cputime.h> | 11 | #include <asm/cputime.h> |
| 11 | 12 | ||
| @@ -15,21 +16,25 @@ | |||
| 15 | * used by rstatd/perfmeter | 16 | * used by rstatd/perfmeter |
| 16 | */ | 17 | */ |
| 17 | 18 | ||
| 18 | struct cpu_usage_stat { | 19 | enum cpu_usage_stat { |
| 19 | cputime64_t user; | 20 | CPUTIME_USER, |
| 20 | cputime64_t nice; | 21 | CPUTIME_NICE, |
| 21 | cputime64_t system; | 22 | CPUTIME_SYSTEM, |
| 22 | cputime64_t softirq; | 23 | CPUTIME_SOFTIRQ, |
| 23 | cputime64_t irq; | 24 | CPUTIME_IRQ, |
| 24 | cputime64_t idle; | 25 | CPUTIME_IDLE, |
| 25 | cputime64_t iowait; | 26 | CPUTIME_IOWAIT, |
| 26 | cputime64_t steal; | 27 | CPUTIME_STEAL, |
| 27 | cputime64_t guest; | 28 | CPUTIME_GUEST, |
| 28 | cputime64_t guest_nice; | 29 | CPUTIME_GUEST_NICE, |
| 30 | NR_STATS, | ||
| 31 | }; | ||
| 32 | |||
| 33 | struct kernel_cpustat { | ||
| 34 | u64 cpustat[NR_STATS]; | ||
| 29 | }; | 35 | }; |
| 30 | 36 | ||
| 31 | struct kernel_stat { | 37 | struct kernel_stat { |
| 32 | struct cpu_usage_stat cpustat; | ||
| 33 | #ifndef CONFIG_GENERIC_HARDIRQS | 38 | #ifndef CONFIG_GENERIC_HARDIRQS |
| 34 | unsigned int irqs[NR_IRQS]; | 39 | unsigned int irqs[NR_IRQS]; |
| 35 | #endif | 40 | #endif |
| @@ -38,10 +43,13 @@ struct kernel_stat { | |||
| 38 | }; | 43 | }; |
| 39 | 44 | ||
| 40 | DECLARE_PER_CPU(struct kernel_stat, kstat); | 45 | DECLARE_PER_CPU(struct kernel_stat, kstat); |
| 46 | DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat); | ||
| 41 | 47 | ||
| 42 | #define kstat_cpu(cpu) per_cpu(kstat, cpu) | ||
| 43 | /* Must have preemption disabled for this to be meaningful. */ | 48 | /* Must have preemption disabled for this to be meaningful. */ |
| 44 | #define kstat_this_cpu __get_cpu_var(kstat) | 49 | #define kstat_this_cpu (&__get_cpu_var(kstat)) |
| 50 | #define kcpustat_this_cpu (&__get_cpu_var(kernel_cpustat)) | ||
| 51 | #define kstat_cpu(cpu) per_cpu(kstat, cpu) | ||
| 52 | #define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu) | ||
| 45 | 53 | ||
| 46 | extern unsigned long long nr_context_switches(void); | 54 | extern unsigned long long nr_context_switches(void); |
| 47 | 55 | ||
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index b0e99898527c..e23121f9d82a 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h | |||
| @@ -10,6 +10,8 @@ | |||
| 10 | #define _INCLUDE_GUARD_LATENCYTOP_H_ | 10 | #define _INCLUDE_GUARD_LATENCYTOP_H_ |
| 11 | 11 | ||
| 12 | #include <linux/compiler.h> | 12 | #include <linux/compiler.h> |
| 13 | struct task_struct; | ||
| 14 | |||
| 13 | #ifdef CONFIG_LATENCYTOP | 15 | #ifdef CONFIG_LATENCYTOP |
| 14 | 16 | ||
| 15 | #define LT_SAVECOUNT 32 | 17 | #define LT_SAVECOUNT 32 |
| @@ -23,7 +25,6 @@ struct latency_record { | |||
| 23 | }; | 25 | }; |
| 24 | 26 | ||
| 25 | 27 | ||
| 26 | struct task_struct; | ||
| 27 | 28 | ||
| 28 | extern int latencytop_enabled; | 29 | extern int latencytop_enabled; |
| 29 | void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); | 30 | void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 5649032d73fe..5a2ab3c2757d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -273,9 +273,11 @@ extern int runqueue_is_locked(int cpu); | |||
| 273 | 273 | ||
| 274 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) | 274 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) |
| 275 | extern void select_nohz_load_balancer(int stop_tick); | 275 | extern void select_nohz_load_balancer(int stop_tick); |
| 276 | extern void set_cpu_sd_state_idle(void); | ||
| 276 | extern int get_nohz_timer_target(void); | 277 | extern int get_nohz_timer_target(void); |
| 277 | #else | 278 | #else |
| 278 | static inline void select_nohz_load_balancer(int stop_tick) { } | 279 | static inline void select_nohz_load_balancer(int stop_tick) { } |
| 280 | static inline void set_cpu_sd_state_idle(void) { } | ||
| 279 | #endif | 281 | #endif |
| 280 | 282 | ||
| 281 | /* | 283 | /* |
| @@ -901,6 +903,10 @@ struct sched_group_power { | |||
| 901 | * single CPU. | 903 | * single CPU. |
| 902 | */ | 904 | */ |
| 903 | unsigned int power, power_orig; | 905 | unsigned int power, power_orig; |
| 906 | /* | ||
| 907 | * Number of busy cpus in this group. | ||
| 908 | */ | ||
| 909 | atomic_t nr_busy_cpus; | ||
| 904 | }; | 910 | }; |
| 905 | 911 | ||
| 906 | struct sched_group { | 912 | struct sched_group { |
| @@ -925,6 +931,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) | |||
| 925 | return to_cpumask(sg->cpumask); | 931 | return to_cpumask(sg->cpumask); |
| 926 | } | 932 | } |
| 927 | 933 | ||
| 934 | /** | ||
| 935 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
| 936 | * @group: The group whose first cpu is to be returned. | ||
| 937 | */ | ||
| 938 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
| 939 | { | ||
| 940 | return cpumask_first(sched_group_cpus(group)); | ||
| 941 | } | ||
| 942 | |||
| 928 | struct sched_domain_attr { | 943 | struct sched_domain_attr { |
| 929 | int relax_domain_level; | 944 | int relax_domain_level; |
| 930 | }; | 945 | }; |
| @@ -1315,8 +1330,8 @@ struct task_struct { | |||
| 1315 | * older sibling, respectively. (p->father can be replaced with | 1330 | * older sibling, respectively. (p->father can be replaced with |
| 1316 | * p->real_parent->pid) | 1331 | * p->real_parent->pid) |
| 1317 | */ | 1332 | */ |
| 1318 | struct task_struct *real_parent; /* real parent process */ | 1333 | struct task_struct __rcu *real_parent; /* real parent process */ |
| 1319 | struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */ | 1334 | struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ |
| 1320 | /* | 1335 | /* |
| 1321 | * children/sibling forms the list of my natural children | 1336 | * children/sibling forms the list of my natural children |
| 1322 | */ | 1337 | */ |
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 959ff18b63b6..e33ed1bfa113 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h | |||
| @@ -331,6 +331,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_iowait, | |||
| 331 | TP_ARGS(tsk, delay)); | 331 | TP_ARGS(tsk, delay)); |
| 332 | 332 | ||
| 333 | /* | 333 | /* |
| 334 | * Tracepoint for accounting blocked time (time the task is in uninterruptible). | ||
| 335 | */ | ||
| 336 | DEFINE_EVENT(sched_stat_template, sched_stat_blocked, | ||
| 337 | TP_PROTO(struct task_struct *tsk, u64 delay), | ||
| 338 | TP_ARGS(tsk, delay)); | ||
| 339 | |||
| 340 | /* | ||
| 334 | * Tracepoint for accounting runtime (time the task is executing | 341 | * Tracepoint for accounting runtime (time the task is executing |
| 335 | * on a CPU). | 342 | * on a CPU). |
| 336 | */ | 343 | */ |
diff --git a/kernel/Makefile b/kernel/Makefile index e898c5b9d02c..f70396e5a24b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -2,16 +2,15 @@ | |||
| 2 | # Makefile for the linux kernel. | 2 | # Makefile for the linux kernel. |
| 3 | # | 3 | # |
| 4 | 4 | ||
| 5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | 5 | obj-y = fork.o exec_domain.o panic.o printk.o \ |
| 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
| 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ |
| 8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
| 9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
| 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
| 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
| 12 | notifier.o ksysfs.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o cred.o \ |
| 13 | async.o range.o | 13 | async.o range.o groups.o |
| 14 | obj-y += groups.o | ||
| 15 | 14 | ||
| 16 | ifdef CONFIG_FUNCTION_TRACER | 15 | ifdef CONFIG_FUNCTION_TRACER |
| 17 | # Do not trace debug files and internal ftrace files | 16 | # Do not trace debug files and internal ftrace files |
| @@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg | |||
| 20 | CFLAGS_REMOVE_mutex-debug.o = -pg | 19 | CFLAGS_REMOVE_mutex-debug.o = -pg |
| 21 | CFLAGS_REMOVE_rtmutex-debug.o = -pg | 20 | CFLAGS_REMOVE_rtmutex-debug.o = -pg |
| 22 | CFLAGS_REMOVE_cgroup-debug.o = -pg | 21 | CFLAGS_REMOVE_cgroup-debug.o = -pg |
| 23 | CFLAGS_REMOVE_sched_clock.o = -pg | ||
| 24 | CFLAGS_REMOVE_irq_work.o = -pg | 22 | CFLAGS_REMOVE_irq_work.o = -pg |
| 25 | endif | 23 | endif |
| 26 | 24 | ||
| 25 | obj-y += sched/ | ||
| 26 | |||
| 27 | obj-$(CONFIG_FREEZER) += freezer.o | 27 | obj-$(CONFIG_FREEZER) += freezer.o |
| 28 | obj-$(CONFIG_PROFILING) += profile.o | 28 | obj-$(CONFIG_PROFILING) += profile.o |
| 29 | obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o | 29 | obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o |
| @@ -99,7 +99,6 @@ obj-$(CONFIG_TRACING) += trace/ | |||
| 99 | obj-$(CONFIG_X86_DS) += trace/ | 99 | obj-$(CONFIG_X86_DS) += trace/ |
| 100 | obj-$(CONFIG_RING_BUFFER) += trace/ | 100 | obj-$(CONFIG_RING_BUFFER) += trace/ |
| 101 | obj-$(CONFIG_TRACEPOINTS) += trace/ | 101 | obj-$(CONFIG_TRACEPOINTS) += trace/ |
| 102 | obj-$(CONFIG_SMP) += sched_cpupri.o | ||
| 103 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 102 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
| 104 | obj-$(CONFIG_CPU_PM) += cpu_pm.o | 103 | obj-$(CONFIG_CPU_PM) += cpu_pm.o |
| 105 | 104 | ||
| @@ -110,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o | |||
| 110 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 109 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
| 111 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o | 110 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o |
| 112 | 111 | ||
| 113 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | ||
| 114 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | ||
| 115 | # needed for x86 only. Why this used to be enabled for all architectures is beyond | ||
| 116 | # me. I suspect most platforms don't need this, but until we know that for sure | ||
| 117 | # I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k | ||
| 118 | # to get a correct value for the wait-channel (WCHAN in ps). --davidm | ||
| 119 | CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer | ||
| 120 | endif | ||
| 121 | |||
| 122 | $(obj)/configs.o: $(obj)/config_data.h | 112 | $(obj)/configs.o: $(obj)/config_data.h |
| 123 | 113 | ||
| 124 | # config_data.h contains the same information as ikconfig.h but gzipped. | 114 | # config_data.h contains the same information as ikconfig.h but gzipped. |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile new file mode 100644 index 000000000000..9a7dd35102a3 --- /dev/null +++ b/kernel/sched/Makefile | |||
| @@ -0,0 +1,20 @@ | |||
| 1 | ifdef CONFIG_FUNCTION_TRACER | ||
| 2 | CFLAGS_REMOVE_clock.o = -pg | ||
| 3 | endif | ||
| 4 | |||
| 5 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | ||
| 6 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | ||
| 7 | # needed for x86 only. Why this used to be enabled for all architectures is beyond | ||
| 8 | # me. I suspect most platforms don't need this, but until we know that for sure | ||
| 9 | # I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k | ||
| 10 | # to get a correct value for the wait-channel (WCHAN in ps). --davidm | ||
| 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | ||
| 12 | endif | ||
| 13 | |||
| 14 | obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o | ||
| 15 | obj-$(CONFIG_SMP) += cpupri.o | ||
| 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | ||
| 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | ||
| 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | ||
| 19 | |||
| 20 | |||
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c index 429242f3c484..e8a1f83ee0e7 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched/auto_group.c | |||
| @@ -1,15 +1,19 @@ | |||
| 1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
| 2 | 2 | ||
| 3 | #include "sched.h" | ||
| 4 | |||
| 3 | #include <linux/proc_fs.h> | 5 | #include <linux/proc_fs.h> |
| 4 | #include <linux/seq_file.h> | 6 | #include <linux/seq_file.h> |
| 5 | #include <linux/kallsyms.h> | 7 | #include <linux/kallsyms.h> |
| 6 | #include <linux/utsname.h> | 8 | #include <linux/utsname.h> |
| 9 | #include <linux/security.h> | ||
| 10 | #include <linux/export.h> | ||
| 7 | 11 | ||
| 8 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | 12 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; |
| 9 | static struct autogroup autogroup_default; | 13 | static struct autogroup autogroup_default; |
| 10 | static atomic_t autogroup_seq_nr; | 14 | static atomic_t autogroup_seq_nr; |
| 11 | 15 | ||
| 12 | static void __init autogroup_init(struct task_struct *init_task) | 16 | void __init autogroup_init(struct task_struct *init_task) |
| 13 | { | 17 | { |
| 14 | autogroup_default.tg = &root_task_group; | 18 | autogroup_default.tg = &root_task_group; |
| 15 | kref_init(&autogroup_default.kref); | 19 | kref_init(&autogroup_default.kref); |
| @@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task) | |||
| 17 | init_task->signal->autogroup = &autogroup_default; | 21 | init_task->signal->autogroup = &autogroup_default; |
| 18 | } | 22 | } |
| 19 | 23 | ||
| 20 | static inline void autogroup_free(struct task_group *tg) | 24 | void autogroup_free(struct task_group *tg) |
| 21 | { | 25 | { |
| 22 | kfree(tg->autogroup); | 26 | kfree(tg->autogroup); |
| 23 | } | 27 | } |
| @@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) | |||
| 59 | return ag; | 63 | return ag; |
| 60 | } | 64 | } |
| 61 | 65 | ||
| 62 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 63 | static void free_rt_sched_group(struct task_group *tg); | ||
| 64 | #endif | ||
| 65 | |||
| 66 | static inline struct autogroup *autogroup_create(void) | 66 | static inline struct autogroup *autogroup_create(void) |
| 67 | { | 67 | { |
| 68 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); | 68 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); |
| @@ -108,8 +108,7 @@ out_fail: | |||
| 108 | return autogroup_kref_get(&autogroup_default); | 108 | return autogroup_kref_get(&autogroup_default); |
| 109 | } | 109 | } |
| 110 | 110 | ||
| 111 | static inline bool | 111 | bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) |
| 112 | task_wants_autogroup(struct task_struct *p, struct task_group *tg) | ||
| 113 | { | 112 | { |
| 114 | if (tg != &root_task_group) | 113 | if (tg != &root_task_group) |
| 115 | return false; | 114 | return false; |
| @@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
| 127 | return true; | 126 | return true; |
| 128 | } | 127 | } |
| 129 | 128 | ||
| 130 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
| 131 | { | ||
| 132 | return !!tg->autogroup; | ||
| 133 | } | ||
| 134 | |||
| 135 | static inline struct task_group * | ||
| 136 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
| 137 | { | ||
| 138 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
| 139 | |||
| 140 | if (enabled && task_wants_autogroup(p, tg)) | ||
| 141 | return p->signal->autogroup->tg; | ||
| 142 | |||
| 143 | return tg; | ||
| 144 | } | ||
| 145 | |||
| 146 | static void | 129 | static void |
| 147 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) | 130 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) |
| 148 | { | 131 | { |
| @@ -263,7 +246,7 @@ out: | |||
| 263 | #endif /* CONFIG_PROC_FS */ | 246 | #endif /* CONFIG_PROC_FS */ |
| 264 | 247 | ||
| 265 | #ifdef CONFIG_SCHED_DEBUG | 248 | #ifdef CONFIG_SCHED_DEBUG |
| 266 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 249 | int autogroup_path(struct task_group *tg, char *buf, int buflen) |
| 267 | { | 250 | { |
| 268 | if (!task_group_is_autogroup(tg)) | 251 | if (!task_group_is_autogroup(tg)) |
| 269 | return 0; | 252 | return 0; |
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h index c2f0e7248dca..8bd047142816 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched/auto_group.h | |||
| @@ -1,5 +1,8 @@ | |||
| 1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
| 2 | 2 | ||
| 3 | #include <linux/kref.h> | ||
| 4 | #include <linux/rwsem.h> | ||
| 5 | |||
| 3 | struct autogroup { | 6 | struct autogroup { |
| 4 | /* | 7 | /* |
| 5 | * reference doesn't mean how many thread attach to this | 8 | * reference doesn't mean how many thread attach to this |
| @@ -13,9 +16,28 @@ struct autogroup { | |||
| 13 | int nice; | 16 | int nice; |
| 14 | }; | 17 | }; |
| 15 | 18 | ||
| 16 | static inline bool task_group_is_autogroup(struct task_group *tg); | 19 | extern void autogroup_init(struct task_struct *init_task); |
| 20 | extern void autogroup_free(struct task_group *tg); | ||
| 21 | |||
| 22 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
| 23 | { | ||
| 24 | return !!tg->autogroup; | ||
| 25 | } | ||
| 26 | |||
| 27 | extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); | ||
| 28 | |||
| 17 | static inline struct task_group * | 29 | static inline struct task_group * |
| 18 | autogroup_task_group(struct task_struct *p, struct task_group *tg); | 30 | autogroup_task_group(struct task_struct *p, struct task_group *tg) |
| 31 | { | ||
| 32 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
| 33 | |||
| 34 | if (enabled && task_wants_autogroup(p, tg)) | ||
| 35 | return p->signal->autogroup->tg; | ||
| 36 | |||
| 37 | return tg; | ||
| 38 | } | ||
| 39 | |||
| 40 | extern int autogroup_path(struct task_group *tg, char *buf, int buflen); | ||
| 19 | 41 | ||
| 20 | #else /* !CONFIG_SCHED_AUTOGROUP */ | 42 | #else /* !CONFIG_SCHED_AUTOGROUP */ |
| 21 | 43 | ||
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c index c685e31492df..c685e31492df 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched/clock.c | |||
diff --git a/kernel/sched.c b/kernel/sched/core.c index 18cad4467e61..cdf51a2adc26 100644 --- a/kernel/sched.c +++ b/kernel/sched/core.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * kernel/sched.c | 2 | * kernel/sched/core.c |
| 3 | * | 3 | * |
| 4 | * Kernel scheduler and related syscalls | 4 | * Kernel scheduler and related syscalls |
| 5 | * | 5 | * |
| @@ -56,7 +56,6 @@ | |||
| 56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
| 57 | #include <linux/proc_fs.h> | 57 | #include <linux/proc_fs.h> |
| 58 | #include <linux/seq_file.h> | 58 | #include <linux/seq_file.h> |
| 59 | #include <linux/stop_machine.h> | ||
| 60 | #include <linux/sysctl.h> | 59 | #include <linux/sysctl.h> |
| 61 | #include <linux/syscalls.h> | 60 | #include <linux/syscalls.h> |
| 62 | #include <linux/times.h> | 61 | #include <linux/times.h> |
| @@ -75,129 +74,17 @@ | |||
| 75 | 74 | ||
| 76 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
| 77 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
| 78 | #include <asm/mutex.h> | ||
| 79 | #ifdef CONFIG_PARAVIRT | 77 | #ifdef CONFIG_PARAVIRT |
| 80 | #include <asm/paravirt.h> | 78 | #include <asm/paravirt.h> |
| 81 | #endif | 79 | #endif |
| 82 | 80 | ||
| 83 | #include "sched_cpupri.h" | 81 | #include "sched.h" |
| 84 | #include "workqueue_sched.h" | 82 | #include "../workqueue_sched.h" |
| 85 | #include "sched_autogroup.h" | ||
| 86 | 83 | ||
| 87 | #define CREATE_TRACE_POINTS | 84 | #define CREATE_TRACE_POINTS |
| 88 | #include <trace/events/sched.h> | 85 | #include <trace/events/sched.h> |
| 89 | 86 | ||
| 90 | /* | 87 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
| 91 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
| 92 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
| 93 | * and back. | ||
| 94 | */ | ||
| 95 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
| 96 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
| 97 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
| 98 | |||
| 99 | /* | ||
| 100 | * 'User priority' is the nice value converted to something we | ||
| 101 | * can work with better when scaling various scheduler parameters, | ||
| 102 | * it's a [ 0 ... 39 ] range. | ||
| 103 | */ | ||
| 104 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
| 105 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
| 106 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
| 107 | |||
| 108 | /* | ||
| 109 | * Helpers for converting nanosecond timing to jiffy resolution | ||
| 110 | */ | ||
| 111 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | ||
| 112 | |||
| 113 | #define NICE_0_LOAD SCHED_LOAD_SCALE | ||
| 114 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | ||
| 115 | |||
| 116 | /* | ||
| 117 | * These are the 'tuning knobs' of the scheduler: | ||
| 118 | * | ||
| 119 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | ||
| 120 | * Timeslices get refilled after they expire. | ||
| 121 | */ | ||
| 122 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
| 123 | |||
| 124 | /* | ||
| 125 | * single value that denotes runtime == period, ie unlimited time. | ||
| 126 | */ | ||
| 127 | #define RUNTIME_INF ((u64)~0ULL) | ||
| 128 | |||
| 129 | static inline int rt_policy(int policy) | ||
| 130 | { | ||
| 131 | if (policy == SCHED_FIFO || policy == SCHED_RR) | ||
| 132 | return 1; | ||
| 133 | return 0; | ||
| 134 | } | ||
| 135 | |||
| 136 | static inline int task_has_rt_policy(struct task_struct *p) | ||
| 137 | { | ||
| 138 | return rt_policy(p->policy); | ||
| 139 | } | ||
| 140 | |||
| 141 | /* | ||
| 142 | * This is the priority-queue data structure of the RT scheduling class: | ||
| 143 | */ | ||
| 144 | struct rt_prio_array { | ||
| 145 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | ||
| 146 | struct list_head queue[MAX_RT_PRIO]; | ||
| 147 | }; | ||
| 148 | |||
| 149 | struct rt_bandwidth { | ||
| 150 | /* nests inside the rq lock: */ | ||
| 151 | raw_spinlock_t rt_runtime_lock; | ||
| 152 | ktime_t rt_period; | ||
| 153 | u64 rt_runtime; | ||
| 154 | struct hrtimer rt_period_timer; | ||
| 155 | }; | ||
| 156 | |||
| 157 | static struct rt_bandwidth def_rt_bandwidth; | ||
| 158 | |||
| 159 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
| 160 | |||
| 161 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
| 162 | { | ||
| 163 | struct rt_bandwidth *rt_b = | ||
| 164 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
| 165 | ktime_t now; | ||
| 166 | int overrun; | ||
| 167 | int idle = 0; | ||
| 168 | |||
| 169 | for (;;) { | ||
| 170 | now = hrtimer_cb_get_time(timer); | ||
| 171 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
| 172 | |||
| 173 | if (!overrun) | ||
| 174 | break; | ||
| 175 | |||
| 176 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
| 177 | } | ||
| 178 | |||
| 179 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
| 180 | } | ||
| 181 | |||
| 182 | static | ||
| 183 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
| 184 | { | ||
| 185 | rt_b->rt_period = ns_to_ktime(period); | ||
| 186 | rt_b->rt_runtime = runtime; | ||
| 187 | |||
| 188 | raw_spin_lock_init(&rt_b->rt_runtime_lock); | ||
| 189 | |||
| 190 | hrtimer_init(&rt_b->rt_period_timer, | ||
| 191 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 192 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
| 193 | } | ||
| 194 | |||
| 195 | static inline int rt_bandwidth_enabled(void) | ||
| 196 | { | ||
| 197 | return sysctl_sched_rt_runtime >= 0; | ||
| 198 | } | ||
| 199 | |||
| 200 | static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) | ||
| 201 | { | 88 | { |
| 202 | unsigned long delta; | 89 | unsigned long delta; |
| 203 | ktime_t soft, hard, now; | 90 | ktime_t soft, hard, now; |
| @@ -217,580 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) | |||
| 217 | } | 104 | } |
| 218 | } | 105 | } |
| 219 | 106 | ||
| 220 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 107 | DEFINE_MUTEX(sched_domains_mutex); |
| 221 | { | 108 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 222 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | ||
| 223 | return; | ||
| 224 | |||
| 225 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
| 226 | return; | ||
| 227 | |||
| 228 | raw_spin_lock(&rt_b->rt_runtime_lock); | ||
| 229 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); | ||
| 230 | raw_spin_unlock(&rt_b->rt_runtime_lock); | ||
| 231 | } | ||
| 232 | |||
| 233 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 234 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
| 235 | { | ||
| 236 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
| 237 | } | ||
| 238 | #endif | ||
| 239 | |||
| 240 | /* | ||
| 241 | * sched_domains_mutex serializes calls to init_sched_domains, | ||
| 242 | * detach_destroy_domains and partition_sched_domains. | ||
| 243 | */ | ||
| 244 | static DEFINE_MUTEX(sched_domains_mutex); | ||
| 245 | |||
| 246 | #ifdef CONFIG_CGROUP_SCHED | ||
| 247 | |||
| 248 | #include <linux/cgroup.h> | ||
| 249 | |||
| 250 | struct cfs_rq; | ||
| 251 | |||
| 252 | static LIST_HEAD(task_groups); | ||
| 253 | |||
| 254 | struct cfs_bandwidth { | ||
| 255 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 256 | raw_spinlock_t lock; | ||
| 257 | ktime_t period; | ||
| 258 | u64 quota, runtime; | ||
| 259 | s64 hierarchal_quota; | ||
| 260 | u64 runtime_expires; | ||
| 261 | |||
| 262 | int idle, timer_active; | ||
| 263 | struct hrtimer period_timer, slack_timer; | ||
| 264 | struct list_head throttled_cfs_rq; | ||
| 265 | |||
| 266 | /* statistics */ | ||
| 267 | int nr_periods, nr_throttled; | ||
| 268 | u64 throttled_time; | ||
| 269 | #endif | ||
| 270 | }; | ||
| 271 | |||
| 272 | /* task group related information */ | ||
| 273 | struct task_group { | ||
| 274 | struct cgroup_subsys_state css; | ||
| 275 | |||
| 276 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 277 | /* schedulable entities of this group on each cpu */ | ||
| 278 | struct sched_entity **se; | ||
| 279 | /* runqueue "owned" by this group on each cpu */ | ||
| 280 | struct cfs_rq **cfs_rq; | ||
| 281 | unsigned long shares; | ||
| 282 | |||
| 283 | atomic_t load_weight; | ||
| 284 | #endif | ||
| 285 | |||
| 286 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 287 | struct sched_rt_entity **rt_se; | ||
| 288 | struct rt_rq **rt_rq; | ||
| 289 | |||
| 290 | struct rt_bandwidth rt_bandwidth; | ||
| 291 | #endif | ||
| 292 | |||
| 293 | struct rcu_head rcu; | ||
| 294 | struct list_head list; | ||
| 295 | |||
| 296 | struct task_group *parent; | ||
| 297 | struct list_head siblings; | ||
| 298 | struct list_head children; | ||
| 299 | |||
| 300 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
| 301 | struct autogroup *autogroup; | ||
| 302 | #endif | ||
| 303 | |||
| 304 | struct cfs_bandwidth cfs_bandwidth; | ||
| 305 | }; | ||
| 306 | |||
| 307 | /* task_group_lock serializes the addition/removal of task groups */ | ||
| 308 | static DEFINE_SPINLOCK(task_group_lock); | ||
| 309 | |||
| 310 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 311 | |||
| 312 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD | ||
| 313 | |||
| 314 | /* | ||
| 315 | * A weight of 0 or 1 can cause arithmetics problems. | ||
| 316 | * A weight of a cfs_rq is the sum of weights of which entities | ||
| 317 | * are queued on this cfs_rq, so a weight of a entity should not be | ||
| 318 | * too large, so as the shares value of a task group. | ||
| 319 | * (The default weight is 1024 - so there's no practical | ||
| 320 | * limitation from this.) | ||
| 321 | */ | ||
| 322 | #define MIN_SHARES (1UL << 1) | ||
| 323 | #define MAX_SHARES (1UL << 18) | ||
| 324 | |||
| 325 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; | ||
| 326 | #endif | ||
| 327 | |||
| 328 | /* Default task group. | ||
| 329 | * Every task in system belong to this group at bootup. | ||
| 330 | */ | ||
| 331 | struct task_group root_task_group; | ||
| 332 | |||
| 333 | #endif /* CONFIG_CGROUP_SCHED */ | ||
| 334 | |||
| 335 | /* CFS-related fields in a runqueue */ | ||
| 336 | struct cfs_rq { | ||
| 337 | struct load_weight load; | ||
| 338 | unsigned long nr_running, h_nr_running; | ||
| 339 | |||
| 340 | u64 exec_clock; | ||
| 341 | u64 min_vruntime; | ||
| 342 | #ifndef CONFIG_64BIT | ||
| 343 | u64 min_vruntime_copy; | ||
| 344 | #endif | ||
| 345 | |||
| 346 | struct rb_root tasks_timeline; | ||
| 347 | struct rb_node *rb_leftmost; | ||
| 348 | |||
| 349 | struct list_head tasks; | ||
| 350 | struct list_head *balance_iterator; | ||
| 351 | |||
| 352 | /* | ||
| 353 | * 'curr' points to currently running entity on this cfs_rq. | ||
| 354 | * It is set to NULL otherwise (i.e when none are currently running). | ||
| 355 | */ | ||
| 356 | struct sched_entity *curr, *next, *last, *skip; | ||
| 357 | |||
| 358 | #ifdef CONFIG_SCHED_DEBUG | ||
| 359 | unsigned int nr_spread_over; | ||
| 360 | #endif | ||
| 361 | |||
| 362 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 363 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
| 364 | |||
| 365 | /* | ||
| 366 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | ||
| 367 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
| 368 | * (like users, containers etc.) | ||
| 369 | * | ||
| 370 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
| 371 | * list is used during load balance. | ||
| 372 | */ | ||
| 373 | int on_list; | ||
| 374 | struct list_head leaf_cfs_rq_list; | ||
| 375 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
| 376 | |||
| 377 | #ifdef CONFIG_SMP | ||
| 378 | /* | ||
| 379 | * the part of load.weight contributed by tasks | ||
| 380 | */ | ||
| 381 | unsigned long task_weight; | ||
| 382 | |||
| 383 | /* | ||
| 384 | * h_load = weight * f(tg) | ||
| 385 | * | ||
| 386 | * Where f(tg) is the recursive weight fraction assigned to | ||
| 387 | * this group. | ||
| 388 | */ | ||
| 389 | unsigned long h_load; | ||
| 390 | |||
| 391 | /* | ||
| 392 | * Maintaining per-cpu shares distribution for group scheduling | ||
| 393 | * | ||
| 394 | * load_stamp is the last time we updated the load average | ||
| 395 | * load_last is the last time we updated the load average and saw load | ||
| 396 | * load_unacc_exec_time is currently unaccounted execution time | ||
| 397 | */ | ||
| 398 | u64 load_avg; | ||
| 399 | u64 load_period; | ||
| 400 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
| 401 | |||
| 402 | unsigned long load_contribution; | ||
| 403 | #endif | ||
| 404 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 405 | int runtime_enabled; | ||
| 406 | u64 runtime_expires; | ||
| 407 | s64 runtime_remaining; | ||
| 408 | |||
| 409 | u64 throttled_timestamp; | ||
| 410 | int throttled, throttle_count; | ||
| 411 | struct list_head throttled_list; | ||
| 412 | #endif | ||
| 413 | #endif | ||
| 414 | }; | ||
| 415 | |||
| 416 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 417 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 418 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
| 419 | { | ||
| 420 | return &tg->cfs_bandwidth; | ||
| 421 | } | ||
| 422 | |||
| 423 | static inline u64 default_cfs_period(void); | ||
| 424 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
| 425 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
| 426 | |||
| 427 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
| 428 | { | ||
| 429 | struct cfs_bandwidth *cfs_b = | ||
| 430 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
| 431 | do_sched_cfs_slack_timer(cfs_b); | ||
| 432 | |||
| 433 | return HRTIMER_NORESTART; | ||
| 434 | } | ||
| 435 | |||
| 436 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
| 437 | { | ||
| 438 | struct cfs_bandwidth *cfs_b = | ||
| 439 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
| 440 | ktime_t now; | ||
| 441 | int overrun; | ||
| 442 | int idle = 0; | ||
| 443 | |||
| 444 | for (;;) { | ||
| 445 | now = hrtimer_cb_get_time(timer); | ||
| 446 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
| 447 | |||
| 448 | if (!overrun) | ||
| 449 | break; | ||
| 450 | |||
| 451 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
| 452 | } | ||
| 453 | |||
| 454 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
| 455 | } | ||
| 456 | |||
| 457 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
| 458 | { | ||
| 459 | raw_spin_lock_init(&cfs_b->lock); | ||
| 460 | cfs_b->runtime = 0; | ||
| 461 | cfs_b->quota = RUNTIME_INF; | ||
| 462 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
| 463 | |||
| 464 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
| 465 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 466 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
| 467 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 468 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
| 469 | } | ||
| 470 | |||
| 471 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
| 472 | { | ||
| 473 | cfs_rq->runtime_enabled = 0; | ||
| 474 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
| 475 | } | ||
| 476 | |||
| 477 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
| 478 | static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
| 479 | { | ||
| 480 | /* | ||
| 481 | * The timer may be active because we're trying to set a new bandwidth | ||
| 482 | * period or because we're racing with the tear-down path | ||
| 483 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
| 484 | * terminates). In either case we ensure that it's re-programmed | ||
| 485 | */ | ||
| 486 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
| 487 | raw_spin_unlock(&cfs_b->lock); | ||
| 488 | /* ensure cfs_b->lock is available while we wait */ | ||
| 489 | hrtimer_cancel(&cfs_b->period_timer); | ||
| 490 | |||
| 491 | raw_spin_lock(&cfs_b->lock); | ||
| 492 | /* if someone else restarted the timer then we're done */ | ||
| 493 | if (cfs_b->timer_active) | ||
| 494 | return; | ||
| 495 | } | ||
| 496 | |||
| 497 | cfs_b->timer_active = 1; | ||
| 498 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
| 499 | } | ||
| 500 | |||
| 501 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
| 502 | { | ||
| 503 | hrtimer_cancel(&cfs_b->period_timer); | ||
| 504 | hrtimer_cancel(&cfs_b->slack_timer); | ||
| 505 | } | ||
| 506 | #else | ||
| 507 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
| 508 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
| 509 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
| 510 | |||
| 511 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
| 512 | { | ||
| 513 | return NULL; | ||
| 514 | } | ||
| 515 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
| 516 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 517 | |||
| 518 | /* Real-Time classes' related field in a runqueue: */ | ||
| 519 | struct rt_rq { | ||
| 520 | struct rt_prio_array active; | ||
| 521 | unsigned long rt_nr_running; | ||
| 522 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | ||
| 523 | struct { | ||
| 524 | int curr; /* highest queued rt task prio */ | ||
| 525 | #ifdef CONFIG_SMP | ||
| 526 | int next; /* next highest */ | ||
| 527 | #endif | ||
| 528 | } highest_prio; | ||
| 529 | #endif | ||
| 530 | #ifdef CONFIG_SMP | ||
| 531 | unsigned long rt_nr_migratory; | ||
| 532 | unsigned long rt_nr_total; | ||
| 533 | int overloaded; | ||
| 534 | struct plist_head pushable_tasks; | ||
| 535 | #endif | ||
| 536 | int rt_throttled; | ||
| 537 | u64 rt_time; | ||
| 538 | u64 rt_runtime; | ||
| 539 | /* Nests inside the rq lock: */ | ||
| 540 | raw_spinlock_t rt_runtime_lock; | ||
| 541 | |||
| 542 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 543 | unsigned long rt_nr_boosted; | ||
| 544 | |||
| 545 | struct rq *rq; | ||
| 546 | struct list_head leaf_rt_rq_list; | ||
| 547 | struct task_group *tg; | ||
| 548 | #endif | ||
| 549 | }; | ||
| 550 | |||
| 551 | #ifdef CONFIG_SMP | ||
| 552 | |||
| 553 | /* | ||
| 554 | * We add the notion of a root-domain which will be used to define per-domain | ||
| 555 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
| 556 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
| 557 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
| 558 | * object. | ||
| 559 | * | ||
| 560 | */ | ||
| 561 | struct root_domain { | ||
| 562 | atomic_t refcount; | ||
| 563 | atomic_t rto_count; | ||
| 564 | struct rcu_head rcu; | ||
| 565 | cpumask_var_t span; | ||
| 566 | cpumask_var_t online; | ||
| 567 | |||
| 568 | /* | ||
| 569 | * The "RT overload" flag: it gets set if a CPU has more than | ||
| 570 | * one runnable RT task. | ||
| 571 | */ | ||
| 572 | cpumask_var_t rto_mask; | ||
| 573 | struct cpupri cpupri; | ||
| 574 | }; | ||
| 575 | |||
| 576 | /* | ||
| 577 | * By default the system creates a single root-domain with all cpus as | ||
| 578 | * members (mimicking the global state we have today). | ||
| 579 | */ | ||
| 580 | static struct root_domain def_root_domain; | ||
| 581 | |||
| 582 | #endif /* CONFIG_SMP */ | ||
| 583 | |||
| 584 | /* | ||
| 585 | * This is the main, per-CPU runqueue data structure. | ||
| 586 | * | ||
| 587 | * Locking rule: those places that want to lock multiple runqueues | ||
| 588 | * (such as the load balancing or the thread migration code), lock | ||
| 589 | * acquire operations must be ordered by ascending &runqueue. | ||
| 590 | */ | ||
| 591 | struct rq { | ||
| 592 | /* runqueue lock: */ | ||
| 593 | raw_spinlock_t lock; | ||
| 594 | |||
| 595 | /* | ||
| 596 | * nr_running and cpu_load should be in the same cacheline because | ||
| 597 | * remote CPUs use both these fields when doing load calculation. | ||
| 598 | */ | ||
| 599 | unsigned long nr_running; | ||
| 600 | #define CPU_LOAD_IDX_MAX 5 | ||
| 601 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | ||
| 602 | unsigned long last_load_update_tick; | ||
| 603 | #ifdef CONFIG_NO_HZ | ||
| 604 | u64 nohz_stamp; | ||
| 605 | unsigned char nohz_balance_kick; | ||
| 606 | #endif | ||
| 607 | int skip_clock_update; | ||
| 608 | |||
| 609 | /* capture load from *all* tasks on this cpu: */ | ||
| 610 | struct load_weight load; | ||
| 611 | unsigned long nr_load_updates; | ||
| 612 | u64 nr_switches; | ||
| 613 | |||
| 614 | struct cfs_rq cfs; | ||
| 615 | struct rt_rq rt; | ||
| 616 | |||
| 617 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 618 | /* list of leaf cfs_rq on this cpu: */ | ||
| 619 | struct list_head leaf_cfs_rq_list; | ||
| 620 | #endif | ||
| 621 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 622 | struct list_head leaf_rt_rq_list; | ||
| 623 | #endif | ||
| 624 | |||
| 625 | /* | ||
| 626 | * This is part of a global counter where only the total sum | ||
| 627 | * over all CPUs matters. A task can increase this counter on | ||
| 628 | * one CPU and if it got migrated afterwards it may decrease | ||
| 629 | * it on another CPU. Always updated under the runqueue lock: | ||
| 630 | */ | ||
| 631 | unsigned long nr_uninterruptible; | ||
| 632 | |||
| 633 | struct task_struct *curr, *idle, *stop; | ||
| 634 | unsigned long next_balance; | ||
| 635 | struct mm_struct *prev_mm; | ||
| 636 | |||
| 637 | u64 clock; | ||
| 638 | u64 clock_task; | ||
| 639 | |||
| 640 | atomic_t nr_iowait; | ||
| 641 | |||
| 642 | #ifdef CONFIG_SMP | ||
| 643 | struct root_domain *rd; | ||
| 644 | struct sched_domain *sd; | ||
| 645 | |||
| 646 | unsigned long cpu_power; | ||
| 647 | |||
| 648 | unsigned char idle_balance; | ||
| 649 | /* For active balancing */ | ||
| 650 | int post_schedule; | ||
| 651 | int active_balance; | ||
| 652 | int push_cpu; | ||
| 653 | struct cpu_stop_work active_balance_work; | ||
| 654 | /* cpu of this runqueue: */ | ||
| 655 | int cpu; | ||
| 656 | int online; | ||
| 657 | |||
| 658 | u64 rt_avg; | ||
| 659 | u64 age_stamp; | ||
| 660 | u64 idle_stamp; | ||
| 661 | u64 avg_idle; | ||
| 662 | #endif | ||
| 663 | |||
| 664 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 665 | u64 prev_irq_time; | ||
| 666 | #endif | ||
| 667 | #ifdef CONFIG_PARAVIRT | ||
| 668 | u64 prev_steal_time; | ||
| 669 | #endif | ||
| 670 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
| 671 | u64 prev_steal_time_rq; | ||
| 672 | #endif | ||
| 673 | |||
| 674 | /* calc_load related fields */ | ||
| 675 | unsigned long calc_load_update; | ||
| 676 | long calc_load_active; | ||
| 677 | |||
| 678 | #ifdef CONFIG_SCHED_HRTICK | ||
| 679 | #ifdef CONFIG_SMP | ||
| 680 | int hrtick_csd_pending; | ||
| 681 | struct call_single_data hrtick_csd; | ||
| 682 | #endif | ||
| 683 | struct hrtimer hrtick_timer; | ||
| 684 | #endif | ||
| 685 | |||
| 686 | #ifdef CONFIG_SCHEDSTATS | ||
| 687 | /* latency stats */ | ||
| 688 | struct sched_info rq_sched_info; | ||
| 689 | unsigned long long rq_cpu_time; | ||
| 690 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | ||
| 691 | |||
| 692 | /* sys_sched_yield() stats */ | ||
| 693 | unsigned int yld_count; | ||
| 694 | |||
| 695 | /* schedule() stats */ | ||
| 696 | unsigned int sched_switch; | ||
| 697 | unsigned int sched_count; | ||
| 698 | unsigned int sched_goidle; | ||
| 699 | |||
| 700 | /* try_to_wake_up() stats */ | ||
| 701 | unsigned int ttwu_count; | ||
| 702 | unsigned int ttwu_local; | ||
| 703 | #endif | ||
| 704 | |||
| 705 | #ifdef CONFIG_SMP | ||
| 706 | struct llist_head wake_list; | ||
| 707 | #endif | ||
| 708 | }; | ||
| 709 | |||
| 710 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | ||
| 711 | |||
| 712 | |||
| 713 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); | ||
| 714 | |||
| 715 | static inline int cpu_of(struct rq *rq) | ||
| 716 | { | ||
| 717 | #ifdef CONFIG_SMP | ||
| 718 | return rq->cpu; | ||
| 719 | #else | ||
| 720 | return 0; | ||
| 721 | #endif | ||
| 722 | } | ||
| 723 | |||
| 724 | #define rcu_dereference_check_sched_domain(p) \ | ||
| 725 | rcu_dereference_check((p), \ | ||
| 726 | lockdep_is_held(&sched_domains_mutex)) | ||
| 727 | |||
| 728 | /* | ||
| 729 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
| 730 | * See detach_destroy_domains: synchronize_sched for details. | ||
| 731 | * | ||
| 732 | * The domain tree of any CPU may only be accessed from within | ||
| 733 | * preempt-disabled sections. | ||
| 734 | */ | ||
| 735 | #define for_each_domain(cpu, __sd) \ | ||
| 736 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | ||
| 737 | |||
| 738 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | ||
| 739 | #define this_rq() (&__get_cpu_var(runqueues)) | ||
| 740 | #define task_rq(p) cpu_rq(task_cpu(p)) | ||
| 741 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | ||
| 742 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
| 743 | |||
| 744 | #ifdef CONFIG_CGROUP_SCHED | ||
| 745 | |||
| 746 | /* | ||
| 747 | * Return the group to which this tasks belongs. | ||
| 748 | * | ||
| 749 | * We use task_subsys_state_check() and extend the RCU verification with | ||
| 750 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each | ||
| 751 | * task it moves into the cgroup. Therefore by holding either of those locks, | ||
| 752 | * we pin the task to the current cgroup. | ||
| 753 | */ | ||
| 754 | static inline struct task_group *task_group(struct task_struct *p) | ||
| 755 | { | ||
| 756 | struct task_group *tg; | ||
| 757 | struct cgroup_subsys_state *css; | ||
| 758 | |||
| 759 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | ||
| 760 | lockdep_is_held(&p->pi_lock) || | ||
| 761 | lockdep_is_held(&task_rq(p)->lock)); | ||
| 762 | tg = container_of(css, struct task_group, css); | ||
| 763 | |||
| 764 | return autogroup_task_group(p, tg); | ||
| 765 | } | ||
| 766 | |||
| 767 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | ||
| 768 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | ||
| 769 | { | ||
| 770 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 771 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | ||
| 772 | p->se.parent = task_group(p)->se[cpu]; | ||
| 773 | #endif | ||
| 774 | |||
| 775 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 776 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
| 777 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
| 778 | #endif | ||
| 779 | } | ||
| 780 | |||
| 781 | #else /* CONFIG_CGROUP_SCHED */ | ||
| 782 | |||
| 783 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | ||
| 784 | static inline struct task_group *task_group(struct task_struct *p) | ||
| 785 | { | ||
| 786 | return NULL; | ||
| 787 | } | ||
| 788 | |||
| 789 | #endif /* CONFIG_CGROUP_SCHED */ | ||
| 790 | 109 | ||
| 791 | static void update_rq_clock_task(struct rq *rq, s64 delta); | 110 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
| 792 | 111 | ||
| 793 | static void update_rq_clock(struct rq *rq) | 112 | void update_rq_clock(struct rq *rq) |
| 794 | { | 113 | { |
| 795 | s64 delta; | 114 | s64 delta; |
| 796 | 115 | ||
| @@ -803,44 +122,14 @@ static void update_rq_clock(struct rq *rq) | |||
| 803 | } | 122 | } |
| 804 | 123 | ||
| 805 | /* | 124 | /* |
| 806 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
| 807 | */ | ||
| 808 | #ifdef CONFIG_SCHED_DEBUG | ||
| 809 | # define const_debug __read_mostly | ||
| 810 | #else | ||
| 811 | # define const_debug static const | ||
| 812 | #endif | ||
| 813 | |||
| 814 | /** | ||
| 815 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked | ||
| 816 | * @cpu: the processor in question. | ||
| 817 | * | ||
| 818 | * This interface allows printk to be called with the runqueue lock | ||
| 819 | * held and know whether or not it is OK to wake up the klogd. | ||
| 820 | */ | ||
| 821 | int runqueue_is_locked(int cpu) | ||
| 822 | { | ||
| 823 | return raw_spin_is_locked(&cpu_rq(cpu)->lock); | ||
| 824 | } | ||
| 825 | |||
| 826 | /* | ||
| 827 | * Debugging: various feature bits | 125 | * Debugging: various feature bits |
| 828 | */ | 126 | */ |
| 829 | 127 | ||
| 830 | #define SCHED_FEAT(name, enabled) \ | 128 | #define SCHED_FEAT(name, enabled) \ |
| 831 | __SCHED_FEAT_##name , | ||
| 832 | |||
| 833 | enum { | ||
| 834 | #include "sched_features.h" | ||
| 835 | }; | ||
| 836 | |||
| 837 | #undef SCHED_FEAT | ||
| 838 | |||
| 839 | #define SCHED_FEAT(name, enabled) \ | ||
| 840 | (1UL << __SCHED_FEAT_##name) * enabled | | 129 | (1UL << __SCHED_FEAT_##name) * enabled | |
| 841 | 130 | ||
| 842 | const_debug unsigned int sysctl_sched_features = | 131 | const_debug unsigned int sysctl_sched_features = |
| 843 | #include "sched_features.h" | 132 | #include "features.h" |
| 844 | 0; | 133 | 0; |
| 845 | 134 | ||
| 846 | #undef SCHED_FEAT | 135 | #undef SCHED_FEAT |
| @@ -850,7 +139,7 @@ const_debug unsigned int sysctl_sched_features = | |||
| 850 | #name , | 139 | #name , |
| 851 | 140 | ||
| 852 | static __read_mostly char *sched_feat_names[] = { | 141 | static __read_mostly char *sched_feat_names[] = { |
| 853 | #include "sched_features.h" | 142 | #include "features.h" |
| 854 | NULL | 143 | NULL |
| 855 | }; | 144 | }; |
| 856 | 145 | ||
| @@ -860,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v) | |||
| 860 | { | 149 | { |
| 861 | int i; | 150 | int i; |
| 862 | 151 | ||
| 863 | for (i = 0; sched_feat_names[i]; i++) { | 152 | for (i = 0; i < __SCHED_FEAT_NR; i++) { |
| 864 | if (!(sysctl_sched_features & (1UL << i))) | 153 | if (!(sysctl_sched_features & (1UL << i))) |
| 865 | seq_puts(m, "NO_"); | 154 | seq_puts(m, "NO_"); |
| 866 | seq_printf(m, "%s ", sched_feat_names[i]); | 155 | seq_printf(m, "%s ", sched_feat_names[i]); |
| @@ -870,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v) | |||
| 870 | return 0; | 159 | return 0; |
| 871 | } | 160 | } |
| 872 | 161 | ||
| 162 | #ifdef HAVE_JUMP_LABEL | ||
| 163 | |||
| 164 | #define jump_label_key__true jump_label_key_enabled | ||
| 165 | #define jump_label_key__false jump_label_key_disabled | ||
| 166 | |||
| 167 | #define SCHED_FEAT(name, enabled) \ | ||
| 168 | jump_label_key__##enabled , | ||
| 169 | |||
| 170 | struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { | ||
| 171 | #include "features.h" | ||
| 172 | }; | ||
| 173 | |||
| 174 | #undef SCHED_FEAT | ||
| 175 | |||
| 176 | static void sched_feat_disable(int i) | ||
| 177 | { | ||
| 178 | if (jump_label_enabled(&sched_feat_keys[i])) | ||
| 179 | jump_label_dec(&sched_feat_keys[i]); | ||
| 180 | } | ||
| 181 | |||
| 182 | static void sched_feat_enable(int i) | ||
| 183 | { | ||
| 184 | if (!jump_label_enabled(&sched_feat_keys[i])) | ||
| 185 | jump_label_inc(&sched_feat_keys[i]); | ||
| 186 | } | ||
| 187 | #else | ||
| 188 | static void sched_feat_disable(int i) { }; | ||
| 189 | static void sched_feat_enable(int i) { }; | ||
| 190 | #endif /* HAVE_JUMP_LABEL */ | ||
| 191 | |||
| 873 | static ssize_t | 192 | static ssize_t |
| 874 | sched_feat_write(struct file *filp, const char __user *ubuf, | 193 | sched_feat_write(struct file *filp, const char __user *ubuf, |
| 875 | size_t cnt, loff_t *ppos) | 194 | size_t cnt, loff_t *ppos) |
| @@ -893,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
| 893 | cmp += 3; | 212 | cmp += 3; |
| 894 | } | 213 | } |
| 895 | 214 | ||
| 896 | for (i = 0; sched_feat_names[i]; i++) { | 215 | for (i = 0; i < __SCHED_FEAT_NR; i++) { |
| 897 | if (strcmp(cmp, sched_feat_names[i]) == 0) { | 216 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
| 898 | if (neg) | 217 | if (neg) { |
| 899 | sysctl_sched_features &= ~(1UL << i); | 218 | sysctl_sched_features &= ~(1UL << i); |
| 900 | else | 219 | sched_feat_disable(i); |
| 220 | } else { | ||
| 901 | sysctl_sched_features |= (1UL << i); | 221 | sysctl_sched_features |= (1UL << i); |
| 222 | sched_feat_enable(i); | ||
| 223 | } | ||
| 902 | break; | 224 | break; |
| 903 | } | 225 | } |
| 904 | } | 226 | } |
| 905 | 227 | ||
| 906 | if (!sched_feat_names[i]) | 228 | if (i == __SCHED_FEAT_NR) |
| 907 | return -EINVAL; | 229 | return -EINVAL; |
| 908 | 230 | ||
| 909 | *ppos += cnt; | 231 | *ppos += cnt; |
| @@ -932,10 +254,7 @@ static __init int sched_init_debug(void) | |||
| 932 | return 0; | 254 | return 0; |
| 933 | } | 255 | } |
| 934 | late_initcall(sched_init_debug); | 256 | late_initcall(sched_init_debug); |
| 935 | 257 | #endif /* CONFIG_SCHED_DEBUG */ | |
| 936 | #endif | ||
| 937 | |||
| 938 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
| 939 | 258 | ||
| 940 | /* | 259 | /* |
| 941 | * Number of tasks to iterate in a single balance run. | 260 | * Number of tasks to iterate in a single balance run. |
| @@ -957,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | |||
| 957 | */ | 276 | */ |
| 958 | unsigned int sysctl_sched_rt_period = 1000000; | 277 | unsigned int sysctl_sched_rt_period = 1000000; |
| 959 | 278 | ||
| 960 | static __read_mostly int scheduler_running; | 279 | __read_mostly int scheduler_running; |
| 961 | 280 | ||
| 962 | /* | 281 | /* |
| 963 | * part of the period that we allow rt tasks to run in us. | 282 | * part of the period that we allow rt tasks to run in us. |
| @@ -965,112 +284,7 @@ static __read_mostly int scheduler_running; | |||
| 965 | */ | 284 | */ |
| 966 | int sysctl_sched_rt_runtime = 950000; | 285 | int sysctl_sched_rt_runtime = 950000; |
| 967 | 286 | ||
| 968 | static inline u64 global_rt_period(void) | ||
| 969 | { | ||
| 970 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
| 971 | } | ||
| 972 | |||
| 973 | static inline u64 global_rt_runtime(void) | ||
| 974 | { | ||
| 975 | if (sysctl_sched_rt_runtime < 0) | ||
| 976 | return RUNTIME_INF; | ||
| 977 | |||
| 978 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
| 979 | } | ||
| 980 | |||
| 981 | #ifndef prepare_arch_switch | ||
| 982 | # define prepare_arch_switch(next) do { } while (0) | ||
| 983 | #endif | ||
| 984 | #ifndef finish_arch_switch | ||
| 985 | # define finish_arch_switch(prev) do { } while (0) | ||
| 986 | #endif | ||
| 987 | |||
| 988 | static inline int task_current(struct rq *rq, struct task_struct *p) | ||
| 989 | { | ||
| 990 | return rq->curr == p; | ||
| 991 | } | ||
| 992 | |||
| 993 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
| 994 | { | ||
| 995 | #ifdef CONFIG_SMP | ||
| 996 | return p->on_cpu; | ||
| 997 | #else | ||
| 998 | return task_current(rq, p); | ||
| 999 | #endif | ||
| 1000 | } | ||
| 1001 | |||
| 1002 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 1003 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
| 1004 | { | ||
| 1005 | #ifdef CONFIG_SMP | ||
| 1006 | /* | ||
| 1007 | * We can optimise this out completely for !SMP, because the | ||
| 1008 | * SMP rebalancing from interrupt is the only thing that cares | ||
| 1009 | * here. | ||
| 1010 | */ | ||
| 1011 | next->on_cpu = 1; | ||
| 1012 | #endif | ||
| 1013 | } | ||
| 1014 | |||
| 1015 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
| 1016 | { | ||
| 1017 | #ifdef CONFIG_SMP | ||
| 1018 | /* | ||
| 1019 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
| 1020 | * We must ensure this doesn't happen until the switch is completely | ||
| 1021 | * finished. | ||
| 1022 | */ | ||
| 1023 | smp_wmb(); | ||
| 1024 | prev->on_cpu = 0; | ||
| 1025 | #endif | ||
| 1026 | #ifdef CONFIG_DEBUG_SPINLOCK | ||
| 1027 | /* this is a valid case when another task releases the spinlock */ | ||
| 1028 | rq->lock.owner = current; | ||
| 1029 | #endif | ||
| 1030 | /* | ||
| 1031 | * If we are tracking spinlock dependencies then we have to | ||
| 1032 | * fix up the runqueue lock - which gets 'carried over' from | ||
| 1033 | * prev into current: | ||
| 1034 | */ | ||
| 1035 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | ||
| 1036 | |||
| 1037 | raw_spin_unlock_irq(&rq->lock); | ||
| 1038 | } | ||
| 1039 | 287 | ||
| 1040 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
| 1041 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
| 1042 | { | ||
| 1043 | #ifdef CONFIG_SMP | ||
| 1044 | /* | ||
| 1045 | * We can optimise this out completely for !SMP, because the | ||
| 1046 | * SMP rebalancing from interrupt is the only thing that cares | ||
| 1047 | * here. | ||
| 1048 | */ | ||
| 1049 | next->on_cpu = 1; | ||
| 1050 | #endif | ||
| 1051 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1052 | raw_spin_unlock_irq(&rq->lock); | ||
| 1053 | #else | ||
| 1054 | raw_spin_unlock(&rq->lock); | ||
| 1055 | #endif | ||
| 1056 | } | ||
| 1057 | |||
| 1058 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
| 1059 | { | ||
| 1060 | #ifdef CONFIG_SMP | ||
| 1061 | /* | ||
| 1062 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
| 1063 | * We must ensure this doesn't happen until the switch is completely | ||
| 1064 | * finished. | ||
| 1065 | */ | ||
| 1066 | smp_wmb(); | ||
| 1067 | prev->on_cpu = 0; | ||
| 1068 | #endif | ||
| 1069 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1070 | local_irq_enable(); | ||
| 1071 | #endif | ||
| 1072 | } | ||
| 1073 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
| 1074 | 288 | ||
| 1075 | /* | 289 | /* |
| 1076 | * __task_rq_lock - lock the rq @p resides on. | 290 | * __task_rq_lock - lock the rq @p resides on. |
| @@ -1153,20 +367,6 @@ static struct rq *this_rq_lock(void) | |||
| 1153 | * rq->lock. | 367 | * rq->lock. |
| 1154 | */ | 368 | */ |
| 1155 | 369 | ||
| 1156 | /* | ||
| 1157 | * Use hrtick when: | ||
| 1158 | * - enabled by features | ||
| 1159 | * - hrtimer is actually high res | ||
| 1160 | */ | ||
| 1161 | static inline int hrtick_enabled(struct rq *rq) | ||
| 1162 | { | ||
| 1163 | if (!sched_feat(HRTICK)) | ||
| 1164 | return 0; | ||
| 1165 | if (!cpu_active(cpu_of(rq))) | ||
| 1166 | return 0; | ||
| 1167 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
| 1168 | } | ||
| 1169 | |||
| 1170 | static void hrtick_clear(struct rq *rq) | 370 | static void hrtick_clear(struct rq *rq) |
| 1171 | { | 371 | { |
| 1172 | if (hrtimer_active(&rq->hrtick_timer)) | 372 | if (hrtimer_active(&rq->hrtick_timer)) |
| @@ -1210,7 +410,7 @@ static void __hrtick_start(void *arg) | |||
| 1210 | * | 410 | * |
| 1211 | * called with rq->lock held and irqs disabled | 411 | * called with rq->lock held and irqs disabled |
| 1212 | */ | 412 | */ |
| 1213 | static void hrtick_start(struct rq *rq, u64 delay) | 413 | void hrtick_start(struct rq *rq, u64 delay) |
| 1214 | { | 414 | { |
| 1215 | struct hrtimer *timer = &rq->hrtick_timer; | 415 | struct hrtimer *timer = &rq->hrtick_timer; |
| 1216 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 416 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); |
| @@ -1254,7 +454,7 @@ static __init void init_hrtick(void) | |||
| 1254 | * | 454 | * |
| 1255 | * called with rq->lock held and irqs disabled | 455 | * called with rq->lock held and irqs disabled |
| 1256 | */ | 456 | */ |
| 1257 | static void hrtick_start(struct rq *rq, u64 delay) | 457 | void hrtick_start(struct rq *rq, u64 delay) |
| 1258 | { | 458 | { |
| 1259 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, | 459 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, |
| 1260 | HRTIMER_MODE_REL_PINNED, 0); | 460 | HRTIMER_MODE_REL_PINNED, 0); |
| @@ -1305,7 +505,7 @@ static inline void init_hrtick(void) | |||
| 1305 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 505 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
| 1306 | #endif | 506 | #endif |
| 1307 | 507 | ||
| 1308 | static void resched_task(struct task_struct *p) | 508 | void resched_task(struct task_struct *p) |
| 1309 | { | 509 | { |
| 1310 | int cpu; | 510 | int cpu; |
| 1311 | 511 | ||
| @@ -1326,7 +526,7 @@ static void resched_task(struct task_struct *p) | |||
| 1326 | smp_send_reschedule(cpu); | 526 | smp_send_reschedule(cpu); |
| 1327 | } | 527 | } |
| 1328 | 528 | ||
| 1329 | static void resched_cpu(int cpu) | 529 | void resched_cpu(int cpu) |
| 1330 | { | 530 | { |
| 1331 | struct rq *rq = cpu_rq(cpu); | 531 | struct rq *rq = cpu_rq(cpu); |
| 1332 | unsigned long flags; | 532 | unsigned long flags; |
| @@ -1407,7 +607,8 @@ void wake_up_idle_cpu(int cpu) | |||
| 1407 | 607 | ||
| 1408 | static inline bool got_nohz_idle_kick(void) | 608 | static inline bool got_nohz_idle_kick(void) |
| 1409 | { | 609 | { |
| 1410 | return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; | 610 | int cpu = smp_processor_id(); |
| 611 | return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | ||
| 1411 | } | 612 | } |
| 1412 | 613 | ||
| 1413 | #else /* CONFIG_NO_HZ */ | 614 | #else /* CONFIG_NO_HZ */ |
| @@ -1419,12 +620,7 @@ static inline bool got_nohz_idle_kick(void) | |||
| 1419 | 620 | ||
| 1420 | #endif /* CONFIG_NO_HZ */ | 621 | #endif /* CONFIG_NO_HZ */ |
| 1421 | 622 | ||
| 1422 | static u64 sched_avg_period(void) | 623 | void sched_avg_update(struct rq *rq) |
| 1423 | { | ||
| 1424 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
| 1425 | } | ||
| 1426 | |||
| 1427 | static void sched_avg_update(struct rq *rq) | ||
| 1428 | { | 624 | { |
| 1429 | s64 period = sched_avg_period(); | 625 | s64 period = sched_avg_period(); |
| 1430 | 626 | ||
| @@ -1440,193 +636,23 @@ static void sched_avg_update(struct rq *rq) | |||
| 1440 | } | 636 | } |
| 1441 | } | 637 | } |
| 1442 | 638 | ||
| 1443 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
| 1444 | { | ||
| 1445 | rq->rt_avg += rt_delta; | ||
| 1446 | sched_avg_update(rq); | ||
| 1447 | } | ||
| 1448 | |||
| 1449 | #else /* !CONFIG_SMP */ | 639 | #else /* !CONFIG_SMP */ |
| 1450 | static void resched_task(struct task_struct *p) | 640 | void resched_task(struct task_struct *p) |
| 1451 | { | 641 | { |
| 1452 | assert_raw_spin_locked(&task_rq(p)->lock); | 642 | assert_raw_spin_locked(&task_rq(p)->lock); |
| 1453 | set_tsk_need_resched(p); | 643 | set_tsk_need_resched(p); |
| 1454 | } | 644 | } |
| 1455 | |||
| 1456 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
| 1457 | { | ||
| 1458 | } | ||
| 1459 | |||
| 1460 | static void sched_avg_update(struct rq *rq) | ||
| 1461 | { | ||
| 1462 | } | ||
| 1463 | #endif /* CONFIG_SMP */ | 645 | #endif /* CONFIG_SMP */ |
| 1464 | 646 | ||
| 1465 | #if BITS_PER_LONG == 32 | ||
| 1466 | # define WMULT_CONST (~0UL) | ||
| 1467 | #else | ||
| 1468 | # define WMULT_CONST (1UL << 32) | ||
| 1469 | #endif | ||
| 1470 | |||
| 1471 | #define WMULT_SHIFT 32 | ||
| 1472 | |||
| 1473 | /* | ||
| 1474 | * Shift right and round: | ||
| 1475 | */ | ||
| 1476 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | ||
| 1477 | |||
| 1478 | /* | ||
| 1479 | * delta *= weight / lw | ||
| 1480 | */ | ||
| 1481 | static unsigned long | ||
| 1482 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | ||
| 1483 | struct load_weight *lw) | ||
| 1484 | { | ||
| 1485 | u64 tmp; | ||
| 1486 | |||
| 1487 | /* | ||
| 1488 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
| 1489 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
| 1490 | * 2^SCHED_LOAD_RESOLUTION. | ||
| 1491 | */ | ||
| 1492 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
| 1493 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
| 1494 | else | ||
| 1495 | tmp = (u64)delta_exec; | ||
| 1496 | |||
| 1497 | if (!lw->inv_weight) { | ||
| 1498 | unsigned long w = scale_load_down(lw->weight); | ||
| 1499 | |||
| 1500 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
| 1501 | lw->inv_weight = 1; | ||
| 1502 | else if (unlikely(!w)) | ||
| 1503 | lw->inv_weight = WMULT_CONST; | ||
| 1504 | else | ||
| 1505 | lw->inv_weight = WMULT_CONST / w; | ||
| 1506 | } | ||
| 1507 | |||
| 1508 | /* | ||
| 1509 | * Check whether we'd overflow the 64-bit multiplication: | ||
| 1510 | */ | ||
| 1511 | if (unlikely(tmp > WMULT_CONST)) | ||
| 1512 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | ||
| 1513 | WMULT_SHIFT/2); | ||
| 1514 | else | ||
| 1515 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | ||
| 1516 | |||
| 1517 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | ||
| 1518 | } | ||
| 1519 | |||
| 1520 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
| 1521 | { | ||
| 1522 | lw->weight += inc; | ||
| 1523 | lw->inv_weight = 0; | ||
| 1524 | } | ||
| 1525 | |||
| 1526 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
| 1527 | { | ||
| 1528 | lw->weight -= dec; | ||
| 1529 | lw->inv_weight = 0; | ||
| 1530 | } | ||
| 1531 | |||
| 1532 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
| 1533 | { | ||
| 1534 | lw->weight = w; | ||
| 1535 | lw->inv_weight = 0; | ||
| 1536 | } | ||
| 1537 | |||
| 1538 | /* | ||
| 1539 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
| 1540 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
| 1541 | * each task makes to its run queue's load is weighted according to its | ||
| 1542 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
| 1543 | * scaled version of the new time slice allocation that they receive on time | ||
| 1544 | * slice expiry etc. | ||
| 1545 | */ | ||
| 1546 | |||
| 1547 | #define WEIGHT_IDLEPRIO 3 | ||
| 1548 | #define WMULT_IDLEPRIO 1431655765 | ||
| 1549 | |||
| 1550 | /* | ||
| 1551 | * Nice levels are multiplicative, with a gentle 10% change for every | ||
| 1552 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | ||
| 1553 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | ||
| 1554 | * that remained on nice 0. | ||
| 1555 | * | ||
| 1556 | * The "10% effect" is relative and cumulative: from _any_ nice level, | ||
| 1557 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | ||
| 1558 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | ||
| 1559 | * If a task goes up by ~10% and another task goes down by ~10% then | ||
| 1560 | * the relative distance between them is ~25%.) | ||
| 1561 | */ | ||
| 1562 | static const int prio_to_weight[40] = { | ||
| 1563 | /* -20 */ 88761, 71755, 56483, 46273, 36291, | ||
| 1564 | /* -15 */ 29154, 23254, 18705, 14949, 11916, | ||
| 1565 | /* -10 */ 9548, 7620, 6100, 4904, 3906, | ||
| 1566 | /* -5 */ 3121, 2501, 1991, 1586, 1277, | ||
| 1567 | /* 0 */ 1024, 820, 655, 526, 423, | ||
| 1568 | /* 5 */ 335, 272, 215, 172, 137, | ||
| 1569 | /* 10 */ 110, 87, 70, 56, 45, | ||
| 1570 | /* 15 */ 36, 29, 23, 18, 15, | ||
| 1571 | }; | ||
| 1572 | |||
| 1573 | /* | ||
| 1574 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | ||
| 1575 | * | ||
| 1576 | * In cases where the weight does not change often, we can use the | ||
| 1577 | * precalculated inverse to speed up arithmetics by turning divisions | ||
| 1578 | * into multiplications: | ||
| 1579 | */ | ||
| 1580 | static const u32 prio_to_wmult[40] = { | ||
| 1581 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | ||
| 1582 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | ||
| 1583 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | ||
| 1584 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | ||
| 1585 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | ||
| 1586 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | ||
| 1587 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | ||
| 1588 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | ||
| 1589 | }; | ||
| 1590 | |||
| 1591 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
| 1592 | enum cpuacct_stat_index { | ||
| 1593 | CPUACCT_STAT_USER, /* ... user mode */ | ||
| 1594 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
| 1595 | |||
| 1596 | CPUACCT_STAT_NSTATS, | ||
| 1597 | }; | ||
| 1598 | |||
| 1599 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 1600 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
| 1601 | static void cpuacct_update_stats(struct task_struct *tsk, | ||
| 1602 | enum cpuacct_stat_index idx, cputime_t val); | ||
| 1603 | #else | ||
| 1604 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
| 1605 | static inline void cpuacct_update_stats(struct task_struct *tsk, | ||
| 1606 | enum cpuacct_stat_index idx, cputime_t val) {} | ||
| 1607 | #endif | ||
| 1608 | |||
| 1609 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
| 1610 | { | ||
| 1611 | update_load_add(&rq->load, load); | ||
| 1612 | } | ||
| 1613 | |||
| 1614 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
| 1615 | { | ||
| 1616 | update_load_sub(&rq->load, load); | ||
| 1617 | } | ||
| 1618 | |||
| 1619 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ | 647 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
| 1620 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) | 648 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) |
| 1621 | typedef int (*tg_visitor)(struct task_group *, void *); | ||
| 1622 | |||
| 1623 | /* | 649 | /* |
| 1624 | * Iterate task_group tree rooted at *from, calling @down when first entering a | 650 | * Iterate task_group tree rooted at *from, calling @down when first entering a |
| 1625 | * node and @up when leaving it for the final time. | 651 | * node and @up when leaving it for the final time. |
| 1626 | * | 652 | * |
| 1627 | * Caller must hold rcu_lock or sufficient equivalent. | 653 | * Caller must hold rcu_lock or sufficient equivalent. |
| 1628 | */ | 654 | */ |
| 1629 | static int walk_tg_tree_from(struct task_group *from, | 655 | int walk_tg_tree_from(struct task_group *from, |
| 1630 | tg_visitor down, tg_visitor up, void *data) | 656 | tg_visitor down, tg_visitor up, void *data) |
| 1631 | { | 657 | { |
| 1632 | struct task_group *parent, *child; | 658 | struct task_group *parent, *child; |
| @@ -1657,270 +683,13 @@ out: | |||
| 1657 | return ret; | 683 | return ret; |
| 1658 | } | 684 | } |
| 1659 | 685 | ||
| 1660 | /* | 686 | int tg_nop(struct task_group *tg, void *data) |
| 1661 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
| 1662 | * leaving it for the final time. | ||
| 1663 | * | ||
| 1664 | * Caller must hold rcu_lock or sufficient equivalent. | ||
| 1665 | */ | ||
| 1666 | |||
| 1667 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
| 1668 | { | ||
| 1669 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
| 1670 | } | ||
| 1671 | |||
| 1672 | static int tg_nop(struct task_group *tg, void *data) | ||
| 1673 | { | ||
| 1674 | return 0; | ||
| 1675 | } | ||
| 1676 | #endif | ||
| 1677 | |||
| 1678 | #ifdef CONFIG_SMP | ||
| 1679 | /* Used instead of source_load when we know the type == 0 */ | ||
| 1680 | static unsigned long weighted_cpuload(const int cpu) | ||
| 1681 | { | ||
| 1682 | return cpu_rq(cpu)->load.weight; | ||
| 1683 | } | ||
| 1684 | |||
| 1685 | /* | ||
| 1686 | * Return a low guess at the load of a migration-source cpu weighted | ||
| 1687 | * according to the scheduling class and "nice" value. | ||
| 1688 | * | ||
| 1689 | * We want to under-estimate the load of migration sources, to | ||
| 1690 | * balance conservatively. | ||
| 1691 | */ | ||
| 1692 | static unsigned long source_load(int cpu, int type) | ||
| 1693 | { | ||
| 1694 | struct rq *rq = cpu_rq(cpu); | ||
| 1695 | unsigned long total = weighted_cpuload(cpu); | ||
| 1696 | |||
| 1697 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
| 1698 | return total; | ||
| 1699 | |||
| 1700 | return min(rq->cpu_load[type-1], total); | ||
| 1701 | } | ||
| 1702 | |||
| 1703 | /* | ||
| 1704 | * Return a high guess at the load of a migration-target cpu weighted | ||
| 1705 | * according to the scheduling class and "nice" value. | ||
| 1706 | */ | ||
| 1707 | static unsigned long target_load(int cpu, int type) | ||
| 1708 | { | ||
| 1709 | struct rq *rq = cpu_rq(cpu); | ||
| 1710 | unsigned long total = weighted_cpuload(cpu); | ||
| 1711 | |||
| 1712 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
| 1713 | return total; | ||
| 1714 | |||
| 1715 | return max(rq->cpu_load[type-1], total); | ||
| 1716 | } | ||
| 1717 | |||
| 1718 | static unsigned long power_of(int cpu) | ||
| 1719 | { | ||
| 1720 | return cpu_rq(cpu)->cpu_power; | ||
| 1721 | } | ||
| 1722 | |||
| 1723 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
| 1724 | |||
| 1725 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
| 1726 | { | 687 | { |
| 1727 | struct rq *rq = cpu_rq(cpu); | ||
| 1728 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | ||
| 1729 | |||
| 1730 | if (nr_running) | ||
| 1731 | return rq->load.weight / nr_running; | ||
| 1732 | |||
| 1733 | return 0; | 688 | return 0; |
| 1734 | } | 689 | } |
| 1735 | |||
| 1736 | #ifdef CONFIG_PREEMPT | ||
| 1737 | |||
| 1738 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
| 1739 | |||
| 1740 | /* | ||
| 1741 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | ||
| 1742 | * way at the expense of forcing extra atomic operations in all | ||
| 1743 | * invocations. This assures that the double_lock is acquired using the | ||
| 1744 | * same underlying policy as the spinlock_t on this architecture, which | ||
| 1745 | * reduces latency compared to the unfair variant below. However, it | ||
| 1746 | * also adds more overhead and therefore may reduce throughput. | ||
| 1747 | */ | ||
| 1748 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
| 1749 | __releases(this_rq->lock) | ||
| 1750 | __acquires(busiest->lock) | ||
| 1751 | __acquires(this_rq->lock) | ||
| 1752 | { | ||
| 1753 | raw_spin_unlock(&this_rq->lock); | ||
| 1754 | double_rq_lock(this_rq, busiest); | ||
| 1755 | |||
| 1756 | return 1; | ||
| 1757 | } | ||
| 1758 | |||
| 1759 | #else | ||
| 1760 | /* | ||
| 1761 | * Unfair double_lock_balance: Optimizes throughput at the expense of | ||
| 1762 | * latency by eliminating extra atomic operations when the locks are | ||
| 1763 | * already in proper order on entry. This favors lower cpu-ids and will | ||
| 1764 | * grant the double lock to lower cpus over higher ids under contention, | ||
| 1765 | * regardless of entry order into the function. | ||
| 1766 | */ | ||
| 1767 | static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
| 1768 | __releases(this_rq->lock) | ||
| 1769 | __acquires(busiest->lock) | ||
| 1770 | __acquires(this_rq->lock) | ||
| 1771 | { | ||
| 1772 | int ret = 0; | ||
| 1773 | |||
| 1774 | if (unlikely(!raw_spin_trylock(&busiest->lock))) { | ||
| 1775 | if (busiest < this_rq) { | ||
| 1776 | raw_spin_unlock(&this_rq->lock); | ||
| 1777 | raw_spin_lock(&busiest->lock); | ||
| 1778 | raw_spin_lock_nested(&this_rq->lock, | ||
| 1779 | SINGLE_DEPTH_NESTING); | ||
| 1780 | ret = 1; | ||
| 1781 | } else | ||
| 1782 | raw_spin_lock_nested(&busiest->lock, | ||
| 1783 | SINGLE_DEPTH_NESTING); | ||
| 1784 | } | ||
| 1785 | return ret; | ||
| 1786 | } | ||
| 1787 | |||
| 1788 | #endif /* CONFIG_PREEMPT */ | ||
| 1789 | |||
| 1790 | /* | ||
| 1791 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | ||
| 1792 | */ | ||
| 1793 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
| 1794 | { | ||
| 1795 | if (unlikely(!irqs_disabled())) { | ||
| 1796 | /* printk() doesn't work good under rq->lock */ | ||
| 1797 | raw_spin_unlock(&this_rq->lock); | ||
| 1798 | BUG_ON(1); | ||
| 1799 | } | ||
| 1800 | |||
| 1801 | return _double_lock_balance(this_rq, busiest); | ||
| 1802 | } | ||
| 1803 | |||
| 1804 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
| 1805 | __releases(busiest->lock) | ||
| 1806 | { | ||
| 1807 | raw_spin_unlock(&busiest->lock); | ||
| 1808 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
| 1809 | } | ||
| 1810 | |||
| 1811 | /* | ||
| 1812 | * double_rq_lock - safely lock two runqueues | ||
| 1813 | * | ||
| 1814 | * Note this does not disable interrupts like task_rq_lock, | ||
| 1815 | * you need to do so manually before calling. | ||
| 1816 | */ | ||
| 1817 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
| 1818 | __acquires(rq1->lock) | ||
| 1819 | __acquires(rq2->lock) | ||
| 1820 | { | ||
| 1821 | BUG_ON(!irqs_disabled()); | ||
| 1822 | if (rq1 == rq2) { | ||
| 1823 | raw_spin_lock(&rq1->lock); | ||
| 1824 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
| 1825 | } else { | ||
| 1826 | if (rq1 < rq2) { | ||
| 1827 | raw_spin_lock(&rq1->lock); | ||
| 1828 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
| 1829 | } else { | ||
| 1830 | raw_spin_lock(&rq2->lock); | ||
| 1831 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
| 1832 | } | ||
| 1833 | } | ||
| 1834 | } | ||
| 1835 | |||
| 1836 | /* | ||
| 1837 | * double_rq_unlock - safely unlock two runqueues | ||
| 1838 | * | ||
| 1839 | * Note this does not restore interrupts like task_rq_unlock, | ||
| 1840 | * you need to do so manually after calling. | ||
| 1841 | */ | ||
| 1842 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
| 1843 | __releases(rq1->lock) | ||
| 1844 | __releases(rq2->lock) | ||
| 1845 | { | ||
| 1846 | raw_spin_unlock(&rq1->lock); | ||
| 1847 | if (rq1 != rq2) | ||
| 1848 | raw_spin_unlock(&rq2->lock); | ||
| 1849 | else | ||
| 1850 | __release(rq2->lock); | ||
| 1851 | } | ||
| 1852 | |||
| 1853 | #else /* CONFIG_SMP */ | ||
| 1854 | |||
| 1855 | /* | ||
| 1856 | * double_rq_lock - safely lock two runqueues | ||
| 1857 | * | ||
| 1858 | * Note this does not disable interrupts like task_rq_lock, | ||
| 1859 | * you need to do so manually before calling. | ||
| 1860 | */ | ||
| 1861 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
| 1862 | __acquires(rq1->lock) | ||
| 1863 | __acquires(rq2->lock) | ||
| 1864 | { | ||
| 1865 | BUG_ON(!irqs_disabled()); | ||
| 1866 | BUG_ON(rq1 != rq2); | ||
| 1867 | raw_spin_lock(&rq1->lock); | ||
| 1868 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
| 1869 | } | ||
| 1870 | |||
| 1871 | /* | ||
| 1872 | * double_rq_unlock - safely unlock two runqueues | ||
| 1873 | * | ||
| 1874 | * Note this does not restore interrupts like task_rq_unlock, | ||
| 1875 | * you need to do so manually after calling. | ||
| 1876 | */ | ||
| 1877 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
| 1878 | __releases(rq1->lock) | ||
| 1879 | __releases(rq2->lock) | ||
| 1880 | { | ||
| 1881 | BUG_ON(rq1 != rq2); | ||
| 1882 | raw_spin_unlock(&rq1->lock); | ||
| 1883 | __release(rq2->lock); | ||
| 1884 | } | ||
| 1885 | |||
| 1886 | #endif | 690 | #endif |
| 1887 | 691 | ||
| 1888 | static void calc_load_account_idle(struct rq *this_rq); | 692 | void update_cpu_load(struct rq *this_rq); |
| 1889 | static void update_sysctl(void); | ||
| 1890 | static int get_update_sysctl_factor(void); | ||
| 1891 | static void update_cpu_load(struct rq *this_rq); | ||
| 1892 | |||
| 1893 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
| 1894 | { | ||
| 1895 | set_task_rq(p, cpu); | ||
| 1896 | #ifdef CONFIG_SMP | ||
| 1897 | /* | ||
| 1898 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | ||
| 1899 | * successfully executed on another CPU. We must ensure that updates of | ||
| 1900 | * per-task data have been completed by this moment. | ||
| 1901 | */ | ||
| 1902 | smp_wmb(); | ||
| 1903 | task_thread_info(p)->cpu = cpu; | ||
| 1904 | #endif | ||
| 1905 | } | ||
| 1906 | |||
| 1907 | static const struct sched_class rt_sched_class; | ||
| 1908 | |||
| 1909 | #define sched_class_highest (&stop_sched_class) | ||
| 1910 | #define for_each_class(class) \ | ||
| 1911 | for (class = sched_class_highest; class; class = class->next) | ||
| 1912 | |||
| 1913 | #include "sched_stats.h" | ||
| 1914 | |||
| 1915 | static void inc_nr_running(struct rq *rq) | ||
| 1916 | { | ||
| 1917 | rq->nr_running++; | ||
| 1918 | } | ||
| 1919 | |||
| 1920 | static void dec_nr_running(struct rq *rq) | ||
| 1921 | { | ||
| 1922 | rq->nr_running--; | ||
| 1923 | } | ||
| 1924 | 693 | ||
| 1925 | static void set_load_weight(struct task_struct *p) | 694 | static void set_load_weight(struct task_struct *p) |
| 1926 | { | 695 | { |
| @@ -1957,7 +726,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 1957 | /* | 726 | /* |
| 1958 | * activate_task - move a task to the runqueue. | 727 | * activate_task - move a task to the runqueue. |
| 1959 | */ | 728 | */ |
| 1960 | static void activate_task(struct rq *rq, struct task_struct *p, int flags) | 729 | void activate_task(struct rq *rq, struct task_struct *p, int flags) |
| 1961 | { | 730 | { |
| 1962 | if (task_contributes_to_load(p)) | 731 | if (task_contributes_to_load(p)) |
| 1963 | rq->nr_uninterruptible--; | 732 | rq->nr_uninterruptible--; |
| @@ -1968,7 +737,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 1968 | /* | 737 | /* |
| 1969 | * deactivate_task - remove a task from the runqueue. | 738 | * deactivate_task - remove a task from the runqueue. |
| 1970 | */ | 739 | */ |
| 1971 | static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | 740 | void deactivate_task(struct rq *rq, struct task_struct *p, int flags) |
| 1972 | { | 741 | { |
| 1973 | if (task_contributes_to_load(p)) | 742 | if (task_contributes_to_load(p)) |
| 1974 | rq->nr_uninterruptible++; | 743 | rq->nr_uninterruptible++; |
| @@ -2159,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
| 2159 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 928 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| 2160 | static int irqtime_account_hi_update(void) | 929 | static int irqtime_account_hi_update(void) |
| 2161 | { | 930 | { |
| 2162 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 931 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 2163 | unsigned long flags; | 932 | unsigned long flags; |
| 2164 | u64 latest_ns; | 933 | u64 latest_ns; |
| 2165 | int ret = 0; | 934 | int ret = 0; |
| 2166 | 935 | ||
| 2167 | local_irq_save(flags); | 936 | local_irq_save(flags); |
| 2168 | latest_ns = this_cpu_read(cpu_hardirq_time); | 937 | latest_ns = this_cpu_read(cpu_hardirq_time); |
| 2169 | if (nsecs_to_cputime64(latest_ns) > cpustat->irq) | 938 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) |
| 2170 | ret = 1; | 939 | ret = 1; |
| 2171 | local_irq_restore(flags); | 940 | local_irq_restore(flags); |
| 2172 | return ret; | 941 | return ret; |
| @@ -2174,14 +943,14 @@ static int irqtime_account_hi_update(void) | |||
| 2174 | 943 | ||
| 2175 | static int irqtime_account_si_update(void) | 944 | static int irqtime_account_si_update(void) |
| 2176 | { | 945 | { |
| 2177 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 946 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 2178 | unsigned long flags; | 947 | unsigned long flags; |
| 2179 | u64 latest_ns; | 948 | u64 latest_ns; |
| 2180 | int ret = 0; | 949 | int ret = 0; |
| 2181 | 950 | ||
| 2182 | local_irq_save(flags); | 951 | local_irq_save(flags); |
| 2183 | latest_ns = this_cpu_read(cpu_softirq_time); | 952 | latest_ns = this_cpu_read(cpu_softirq_time); |
| 2184 | if (nsecs_to_cputime64(latest_ns) > cpustat->softirq) | 953 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) |
| 2185 | ret = 1; | 954 | ret = 1; |
| 2186 | local_irq_restore(flags); | 955 | local_irq_restore(flags); |
| 2187 | return ret; | 956 | return ret; |
| @@ -2193,15 +962,6 @@ static int irqtime_account_si_update(void) | |||
| 2193 | 962 | ||
| 2194 | #endif | 963 | #endif |
| 2195 | 964 | ||
| 2196 | #include "sched_idletask.c" | ||
| 2197 | #include "sched_fair.c" | ||
| 2198 | #include "sched_rt.c" | ||
| 2199 | #include "sched_autogroup.c" | ||
| 2200 | #include "sched_stoptask.c" | ||
| 2201 | #ifdef CONFIG_SCHED_DEBUG | ||
| 2202 | # include "sched_debug.c" | ||
| 2203 | #endif | ||
| 2204 | |||
| 2205 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 965 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
| 2206 | { | 966 | { |
| 2207 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 967 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
| @@ -2299,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
| 2299 | p->sched_class->prio_changed(rq, p, oldprio); | 1059 | p->sched_class->prio_changed(rq, p, oldprio); |
| 2300 | } | 1060 | } |
| 2301 | 1061 | ||
| 2302 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | 1062 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) |
| 2303 | { | 1063 | { |
| 2304 | const struct sched_class *class; | 1064 | const struct sched_class *class; |
| 2305 | 1065 | ||
| @@ -2325,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 2325 | } | 1085 | } |
| 2326 | 1086 | ||
| 2327 | #ifdef CONFIG_SMP | 1087 | #ifdef CONFIG_SMP |
| 2328 | /* | ||
| 2329 | * Is this task likely cache-hot: | ||
| 2330 | */ | ||
| 2331 | static int | ||
| 2332 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
| 2333 | { | ||
| 2334 | s64 delta; | ||
| 2335 | |||
| 2336 | if (p->sched_class != &fair_sched_class) | ||
| 2337 | return 0; | ||
| 2338 | |||
| 2339 | if (unlikely(p->policy == SCHED_IDLE)) | ||
| 2340 | return 0; | ||
| 2341 | |||
| 2342 | /* | ||
| 2343 | * Buddy candidates are cache hot: | ||
| 2344 | */ | ||
| 2345 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | ||
| 2346 | (&p->se == cfs_rq_of(&p->se)->next || | ||
| 2347 | &p->se == cfs_rq_of(&p->se)->last)) | ||
| 2348 | return 1; | ||
| 2349 | |||
| 2350 | if (sysctl_sched_migration_cost == -1) | ||
| 2351 | return 1; | ||
| 2352 | if (sysctl_sched_migration_cost == 0) | ||
| 2353 | return 0; | ||
| 2354 | |||
| 2355 | delta = now - p->se.exec_start; | ||
| 2356 | |||
| 2357 | return delta < (s64)sysctl_sched_migration_cost; | ||
| 2358 | } | ||
| 2359 | |||
| 2360 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1088 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
| 2361 | { | 1089 | { |
| 2362 | #ifdef CONFIG_SCHED_DEBUG | 1090 | #ifdef CONFIG_SCHED_DEBUG |
| @@ -3439,7 +2167,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
| 3439 | */ | 2167 | */ |
| 3440 | static atomic_long_t calc_load_tasks_idle; | 2168 | static atomic_long_t calc_load_tasks_idle; |
| 3441 | 2169 | ||
| 3442 | static void calc_load_account_idle(struct rq *this_rq) | 2170 | void calc_load_account_idle(struct rq *this_rq) |
| 3443 | { | 2171 | { |
| 3444 | long delta; | 2172 | long delta; |
| 3445 | 2173 | ||
| @@ -3583,7 +2311,7 @@ static void calc_global_nohz(unsigned long ticks) | |||
| 3583 | */ | 2311 | */ |
| 3584 | } | 2312 | } |
| 3585 | #else | 2313 | #else |
| 3586 | static void calc_load_account_idle(struct rq *this_rq) | 2314 | void calc_load_account_idle(struct rq *this_rq) |
| 3587 | { | 2315 | { |
| 3588 | } | 2316 | } |
| 3589 | 2317 | ||
| @@ -3726,7 +2454,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
| 3726 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2454 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
| 3727 | * every tick. We fix it up based on jiffies. | 2455 | * every tick. We fix it up based on jiffies. |
| 3728 | */ | 2456 | */ |
| 3729 | static void update_cpu_load(struct rq *this_rq) | 2457 | void update_cpu_load(struct rq *this_rq) |
| 3730 | { | 2458 | { |
| 3731 | unsigned long this_load = this_rq->load.weight; | 2459 | unsigned long this_load = this_rq->load.weight; |
| 3732 | unsigned long curr_jiffies = jiffies; | 2460 | unsigned long curr_jiffies = jiffies; |
| @@ -3804,8 +2532,10 @@ unlock: | |||
| 3804 | #endif | 2532 | #endif |
| 3805 | 2533 | ||
| 3806 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 2534 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
| 2535 | DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); | ||
| 3807 | 2536 | ||
| 3808 | EXPORT_PER_CPU_SYMBOL(kstat); | 2537 | EXPORT_PER_CPU_SYMBOL(kstat); |
| 2538 | EXPORT_PER_CPU_SYMBOL(kernel_cpustat); | ||
| 3809 | 2539 | ||
| 3810 | /* | 2540 | /* |
| 3811 | * Return any ns on the sched_clock that have not yet been accounted in | 2541 | * Return any ns on the sched_clock that have not yet been accounted in |
| @@ -3858,6 +2588,42 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
| 3858 | return ns; | 2588 | return ns; |
| 3859 | } | 2589 | } |
| 3860 | 2590 | ||
| 2591 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 2592 | struct cgroup_subsys cpuacct_subsys; | ||
| 2593 | struct cpuacct root_cpuacct; | ||
| 2594 | #endif | ||
| 2595 | |||
| 2596 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
| 2597 | u64 tmp) | ||
| 2598 | { | ||
| 2599 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 2600 | struct kernel_cpustat *kcpustat; | ||
| 2601 | struct cpuacct *ca; | ||
| 2602 | #endif | ||
| 2603 | /* | ||
| 2604 | * Since all updates are sure to touch the root cgroup, we | ||
| 2605 | * get ourselves ahead and touch it first. If the root cgroup | ||
| 2606 | * is the only cgroup, then nothing else should be necessary. | ||
| 2607 | * | ||
| 2608 | */ | ||
| 2609 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
| 2610 | |||
| 2611 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 2612 | if (unlikely(!cpuacct_subsys.active)) | ||
| 2613 | return; | ||
| 2614 | |||
| 2615 | rcu_read_lock(); | ||
| 2616 | ca = task_ca(p); | ||
| 2617 | while (ca && (ca != &root_cpuacct)) { | ||
| 2618 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
| 2619 | kcpustat->cpustat[index] += tmp; | ||
| 2620 | ca = parent_ca(ca); | ||
| 2621 | } | ||
| 2622 | rcu_read_unlock(); | ||
| 2623 | #endif | ||
| 2624 | } | ||
| 2625 | |||
| 2626 | |||
| 3861 | /* | 2627 | /* |
| 3862 | * Account user cpu time to a process. | 2628 | * Account user cpu time to a process. |
| 3863 | * @p: the process that the cpu time gets accounted to | 2629 | * @p: the process that the cpu time gets accounted to |
| @@ -3867,20 +2633,18 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
| 3867 | void account_user_time(struct task_struct *p, cputime_t cputime, | 2633 | void account_user_time(struct task_struct *p, cputime_t cputime, |
| 3868 | cputime_t cputime_scaled) | 2634 | cputime_t cputime_scaled) |
| 3869 | { | 2635 | { |
| 3870 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2636 | int index; |
| 3871 | 2637 | ||
| 3872 | /* Add user time to process. */ | 2638 | /* Add user time to process. */ |
| 3873 | p->utime += cputime; | 2639 | p->utime += cputime; |
| 3874 | p->utimescaled += cputime_scaled; | 2640 | p->utimescaled += cputime_scaled; |
| 3875 | account_group_user_time(p, cputime); | 2641 | account_group_user_time(p, cputime); |
| 3876 | 2642 | ||
| 2643 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
| 2644 | |||
| 3877 | /* Add user time to cpustat. */ | 2645 | /* Add user time to cpustat. */ |
| 3878 | if (TASK_NICE(p) > 0) | 2646 | task_group_account_field(p, index, (__force u64) cputime); |
| 3879 | cpustat->nice += (__force cputime64_t) cputime; | ||
| 3880 | else | ||
| 3881 | cpustat->user += (__force cputime64_t) cputime; | ||
| 3882 | 2647 | ||
| 3883 | cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); | ||
| 3884 | /* Account for user time used */ | 2648 | /* Account for user time used */ |
| 3885 | acct_update_integrals(p); | 2649 | acct_update_integrals(p); |
| 3886 | } | 2650 | } |
| @@ -3894,7 +2658,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, | |||
| 3894 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | 2658 | static void account_guest_time(struct task_struct *p, cputime_t cputime, |
| 3895 | cputime_t cputime_scaled) | 2659 | cputime_t cputime_scaled) |
| 3896 | { | 2660 | { |
| 3897 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2661 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 3898 | 2662 | ||
| 3899 | /* Add guest time to process. */ | 2663 | /* Add guest time to process. */ |
| 3900 | p->utime += cputime; | 2664 | p->utime += cputime; |
| @@ -3904,11 +2668,11 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
| 3904 | 2668 | ||
| 3905 | /* Add guest time to cpustat. */ | 2669 | /* Add guest time to cpustat. */ |
| 3906 | if (TASK_NICE(p) > 0) { | 2670 | if (TASK_NICE(p) > 0) { |
| 3907 | cpustat->nice += (__force cputime64_t) cputime; | 2671 | cpustat[CPUTIME_NICE] += (__force u64) cputime; |
| 3908 | cpustat->guest_nice += (__force cputime64_t) cputime; | 2672 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; |
| 3909 | } else { | 2673 | } else { |
| 3910 | cpustat->user += (__force cputime64_t) cputime; | 2674 | cpustat[CPUTIME_USER] += (__force u64) cputime; |
| 3911 | cpustat->guest += (__force cputime64_t) cputime; | 2675 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; |
| 3912 | } | 2676 | } |
| 3913 | } | 2677 | } |
| 3914 | 2678 | ||
| @@ -3921,7 +2685,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
| 3921 | */ | 2685 | */ |
| 3922 | static inline | 2686 | static inline |
| 3923 | void __account_system_time(struct task_struct *p, cputime_t cputime, | 2687 | void __account_system_time(struct task_struct *p, cputime_t cputime, |
| 3924 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | 2688 | cputime_t cputime_scaled, int index) |
| 3925 | { | 2689 | { |
| 3926 | /* Add system time to process. */ | 2690 | /* Add system time to process. */ |
| 3927 | p->stime += cputime; | 2691 | p->stime += cputime; |
| @@ -3929,8 +2693,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, | |||
| 3929 | account_group_system_time(p, cputime); | 2693 | account_group_system_time(p, cputime); |
| 3930 | 2694 | ||
| 3931 | /* Add system time to cpustat. */ | 2695 | /* Add system time to cpustat. */ |
| 3932 | *target_cputime64 += (__force cputime64_t) cputime; | 2696 | task_group_account_field(p, index, (__force u64) cputime); |
| 3933 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
| 3934 | 2697 | ||
| 3935 | /* Account for system time used */ | 2698 | /* Account for system time used */ |
| 3936 | acct_update_integrals(p); | 2699 | acct_update_integrals(p); |
| @@ -3946,8 +2709,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, | |||
| 3946 | void account_system_time(struct task_struct *p, int hardirq_offset, | 2709 | void account_system_time(struct task_struct *p, int hardirq_offset, |
| 3947 | cputime_t cputime, cputime_t cputime_scaled) | 2710 | cputime_t cputime, cputime_t cputime_scaled) |
| 3948 | { | 2711 | { |
| 3949 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2712 | int index; |
| 3950 | cputime64_t *target_cputime64; | ||
| 3951 | 2713 | ||
| 3952 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 2714 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
| 3953 | account_guest_time(p, cputime, cputime_scaled); | 2715 | account_guest_time(p, cputime, cputime_scaled); |
| @@ -3955,13 +2717,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
| 3955 | } | 2717 | } |
| 3956 | 2718 | ||
| 3957 | if (hardirq_count() - hardirq_offset) | 2719 | if (hardirq_count() - hardirq_offset) |
| 3958 | target_cputime64 = &cpustat->irq; | 2720 | index = CPUTIME_IRQ; |
| 3959 | else if (in_serving_softirq()) | 2721 | else if (in_serving_softirq()) |
| 3960 | target_cputime64 = &cpustat->softirq; | 2722 | index = CPUTIME_SOFTIRQ; |
| 3961 | else | 2723 | else |
| 3962 | target_cputime64 = &cpustat->system; | 2724 | index = CPUTIME_SYSTEM; |
| 3963 | 2725 | ||
| 3964 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); | 2726 | __account_system_time(p, cputime, cputime_scaled, index); |
| 3965 | } | 2727 | } |
| 3966 | 2728 | ||
| 3967 | /* | 2729 | /* |
| @@ -3970,9 +2732,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
| 3970 | */ | 2732 | */ |
| 3971 | void account_steal_time(cputime_t cputime) | 2733 | void account_steal_time(cputime_t cputime) |
| 3972 | { | 2734 | { |
| 3973 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2735 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 3974 | 2736 | ||
| 3975 | cpustat->steal += (__force cputime64_t) cputime; | 2737 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; |
| 3976 | } | 2738 | } |
| 3977 | 2739 | ||
| 3978 | /* | 2740 | /* |
| @@ -3981,13 +2743,13 @@ void account_steal_time(cputime_t cputime) | |||
| 3981 | */ | 2743 | */ |
| 3982 | void account_idle_time(cputime_t cputime) | 2744 | void account_idle_time(cputime_t cputime) |
| 3983 | { | 2745 | { |
| 3984 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2746 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 3985 | struct rq *rq = this_rq(); | 2747 | struct rq *rq = this_rq(); |
| 3986 | 2748 | ||
| 3987 | if (atomic_read(&rq->nr_iowait) > 0) | 2749 | if (atomic_read(&rq->nr_iowait) > 0) |
| 3988 | cpustat->iowait += (__force cputime64_t) cputime; | 2750 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; |
| 3989 | else | 2751 | else |
| 3990 | cpustat->idle += (__force cputime64_t) cputime; | 2752 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; |
| 3991 | } | 2753 | } |
| 3992 | 2754 | ||
| 3993 | static __always_inline bool steal_account_process_tick(void) | 2755 | static __always_inline bool steal_account_process_tick(void) |
| @@ -4037,15 +2799,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
| 4037 | struct rq *rq) | 2799 | struct rq *rq) |
| 4038 | { | 2800 | { |
| 4039 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 2801 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
| 4040 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2802 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 4041 | 2803 | ||
| 4042 | if (steal_account_process_tick()) | 2804 | if (steal_account_process_tick()) |
| 4043 | return; | 2805 | return; |
| 4044 | 2806 | ||
| 4045 | if (irqtime_account_hi_update()) { | 2807 | if (irqtime_account_hi_update()) { |
| 4046 | cpustat->irq += (__force cputime64_t) cputime_one_jiffy; | 2808 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; |
| 4047 | } else if (irqtime_account_si_update()) { | 2809 | } else if (irqtime_account_si_update()) { |
| 4048 | cpustat->softirq += (__force cputime64_t) cputime_one_jiffy; | 2810 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; |
| 4049 | } else if (this_cpu_ksoftirqd() == p) { | 2811 | } else if (this_cpu_ksoftirqd() == p) { |
| 4050 | /* | 2812 | /* |
| 4051 | * ksoftirqd time do not get accounted in cpu_softirq_time. | 2813 | * ksoftirqd time do not get accounted in cpu_softirq_time. |
| @@ -4053,7 +2815,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
| 4053 | * Also, p->stime needs to be updated for ksoftirqd. | 2815 | * Also, p->stime needs to be updated for ksoftirqd. |
| 4054 | */ | 2816 | */ |
| 4055 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 2817 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, |
| 4056 | &cpustat->softirq); | 2818 | CPUTIME_SOFTIRQ); |
| 4057 | } else if (user_tick) { | 2819 | } else if (user_tick) { |
| 4058 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 2820 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
| 4059 | } else if (p == rq->idle) { | 2821 | } else if (p == rq->idle) { |
| @@ -4062,7 +2824,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
| 4062 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | 2824 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); |
| 4063 | } else { | 2825 | } else { |
| 4064 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 2826 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, |
| 4065 | &cpustat->system); | 2827 | CPUTIME_SYSTEM); |
| 4066 | } | 2828 | } |
| 4067 | } | 2829 | } |
| 4068 | 2830 | ||
| @@ -5841,6 +4603,13 @@ again: | |||
| 5841 | */ | 4603 | */ |
| 5842 | if (preempt && rq != p_rq) | 4604 | if (preempt && rq != p_rq) |
| 5843 | resched_task(p_rq->curr); | 4605 | resched_task(p_rq->curr); |
| 4606 | } else { | ||
| 4607 | /* | ||
| 4608 | * We might have set it in task_yield_fair(), but are | ||
| 4609 | * not going to schedule(), so don't want to skip | ||
| 4610 | * the next update. | ||
| 4611 | */ | ||
| 4612 | rq->skip_clock_update = 0; | ||
| 5844 | } | 4613 | } |
| 5845 | 4614 | ||
| 5846 | out: | 4615 | out: |
| @@ -6008,7 +4777,7 @@ void sched_show_task(struct task_struct *p) | |||
| 6008 | free = stack_not_used(p); | 4777 | free = stack_not_used(p); |
| 6009 | #endif | 4778 | #endif |
| 6010 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 4779 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
| 6011 | task_pid_nr(p), task_pid_nr(p->real_parent), | 4780 | task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), |
| 6012 | (unsigned long)task_thread_info(p)->flags); | 4781 | (unsigned long)task_thread_info(p)->flags); |
| 6013 | 4782 | ||
| 6014 | show_stack(p, NULL); | 4783 | show_stack(p, NULL); |
| @@ -6107,53 +4876,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 6107 | #endif | 4876 | #endif |
| 6108 | } | 4877 | } |
| 6109 | 4878 | ||
| 6110 | /* | ||
| 6111 | * Increase the granularity value when there are more CPUs, | ||
| 6112 | * because with more CPUs the 'effective latency' as visible | ||
| 6113 | * to users decreases. But the relationship is not linear, | ||
| 6114 | * so pick a second-best guess by going with the log2 of the | ||
| 6115 | * number of CPUs. | ||
| 6116 | * | ||
| 6117 | * This idea comes from the SD scheduler of Con Kolivas: | ||
| 6118 | */ | ||
| 6119 | static int get_update_sysctl_factor(void) | ||
| 6120 | { | ||
| 6121 | unsigned int cpus = min_t(int, num_online_cpus(), 8); | ||
| 6122 | unsigned int factor; | ||
| 6123 | |||
| 6124 | switch (sysctl_sched_tunable_scaling) { | ||
| 6125 | case SCHED_TUNABLESCALING_NONE: | ||
| 6126 | factor = 1; | ||
| 6127 | break; | ||
| 6128 | case SCHED_TUNABLESCALING_LINEAR: | ||
| 6129 | factor = cpus; | ||
| 6130 | break; | ||
| 6131 | case SCHED_TUNABLESCALING_LOG: | ||
| 6132 | default: | ||
| 6133 | factor = 1 + ilog2(cpus); | ||
| 6134 | break; | ||
| 6135 | } | ||
| 6136 | |||
| 6137 | return factor; | ||
| 6138 | } | ||
| 6139 | |||
| 6140 | static void update_sysctl(void) | ||
| 6141 | { | ||
| 6142 | unsigned int factor = get_update_sysctl_factor(); | ||
| 6143 | |||
| 6144 | #define SET_SYSCTL(name) \ | ||
| 6145 | (sysctl_##name = (factor) * normalized_sysctl_##name) | ||
| 6146 | SET_SYSCTL(sched_min_granularity); | ||
| 6147 | SET_SYSCTL(sched_latency); | ||
| 6148 | SET_SYSCTL(sched_wakeup_granularity); | ||
| 6149 | #undef SET_SYSCTL | ||
| 6150 | } | ||
| 6151 | |||
| 6152 | static inline void sched_init_granularity(void) | ||
| 6153 | { | ||
| 6154 | update_sysctl(); | ||
| 6155 | } | ||
| 6156 | |||
| 6157 | #ifdef CONFIG_SMP | 4879 | #ifdef CONFIG_SMP |
| 6158 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | 4880 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
| 6159 | { | 4881 | { |
| @@ -6340,30 +5062,6 @@ static void calc_global_load_remove(struct rq *rq) | |||
| 6340 | rq->calc_load_active = 0; | 5062 | rq->calc_load_active = 0; |
| 6341 | } | 5063 | } |
| 6342 | 5064 | ||
| 6343 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 6344 | static void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
| 6345 | { | ||
| 6346 | struct cfs_rq *cfs_rq; | ||
| 6347 | |||
| 6348 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
| 6349 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
| 6350 | |||
| 6351 | if (!cfs_rq->runtime_enabled) | ||
| 6352 | continue; | ||
| 6353 | |||
| 6354 | /* | ||
| 6355 | * clock_task is not advancing so we just need to make sure | ||
| 6356 | * there's some valid quota amount | ||
| 6357 | */ | ||
| 6358 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
| 6359 | if (cfs_rq_throttled(cfs_rq)) | ||
| 6360 | unthrottle_cfs_rq(cfs_rq); | ||
| 6361 | } | ||
| 6362 | } | ||
| 6363 | #else | ||
| 6364 | static void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
| 6365 | #endif | ||
| 6366 | |||
| 6367 | /* | 5065 | /* |
| 6368 | * Migrate all tasks from the rq, sleeping tasks will be migrated by | 5066 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
| 6369 | * try_to_wake_up()->select_task_rq(). | 5067 | * try_to_wake_up()->select_task_rq(). |
| @@ -6969,6 +5667,12 @@ out: | |||
| 6969 | return -ENOMEM; | 5667 | return -ENOMEM; |
| 6970 | } | 5668 | } |
| 6971 | 5669 | ||
| 5670 | /* | ||
| 5671 | * By default the system creates a single root-domain with all cpus as | ||
| 5672 | * members (mimicking the global state we have today). | ||
| 5673 | */ | ||
| 5674 | struct root_domain def_root_domain; | ||
| 5675 | |||
| 6972 | static void init_defrootdomain(void) | 5676 | static void init_defrootdomain(void) |
| 6973 | { | 5677 | { |
| 6974 | init_rootdomain(&def_root_domain); | 5678 | init_rootdomain(&def_root_domain); |
| @@ -7237,7 +5941,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 7237 | continue; | 5941 | continue; |
| 7238 | 5942 | ||
| 7239 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 5943 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
| 7240 | GFP_KERNEL, cpu_to_node(i)); | 5944 | GFP_KERNEL, cpu_to_node(cpu)); |
| 7241 | 5945 | ||
| 7242 | if (!sg) | 5946 | if (!sg) |
| 7243 | goto fail; | 5947 | goto fail; |
| @@ -7375,6 +6079,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 7375 | return; | 6079 | return; |
| 7376 | 6080 | ||
| 7377 | update_group_power(sd, cpu); | 6081 | update_group_power(sd, cpu); |
| 6082 | atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); | ||
| 6083 | } | ||
| 6084 | |||
| 6085 | int __weak arch_sd_sibling_asym_packing(void) | ||
| 6086 | { | ||
| 6087 | return 0*SD_ASYM_PACKING; | ||
| 7378 | } | 6088 | } |
| 7379 | 6089 | ||
| 7380 | /* | 6090 | /* |
| @@ -8012,29 +6722,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | |||
| 8012 | } | 6722 | } |
| 8013 | } | 6723 | } |
| 8014 | 6724 | ||
| 8015 | static int update_runtime(struct notifier_block *nfb, | ||
| 8016 | unsigned long action, void *hcpu) | ||
| 8017 | { | ||
| 8018 | int cpu = (int)(long)hcpu; | ||
| 8019 | |||
| 8020 | switch (action) { | ||
| 8021 | case CPU_DOWN_PREPARE: | ||
| 8022 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 8023 | disable_runtime(cpu_rq(cpu)); | ||
| 8024 | return NOTIFY_OK; | ||
| 8025 | |||
| 8026 | case CPU_DOWN_FAILED: | ||
| 8027 | case CPU_DOWN_FAILED_FROZEN: | ||
| 8028 | case CPU_ONLINE: | ||
| 8029 | case CPU_ONLINE_FROZEN: | ||
| 8030 | enable_runtime(cpu_rq(cpu)); | ||
| 8031 | return NOTIFY_OK; | ||
| 8032 | |||
| 8033 | default: | ||
| 8034 | return NOTIFY_DONE; | ||
| 8035 | } | ||
| 8036 | } | ||
| 8037 | |||
| 8038 | void __init sched_init_smp(void) | 6725 | void __init sched_init_smp(void) |
| 8039 | { | 6726 | { |
| 8040 | cpumask_var_t non_isolated_cpus; | 6727 | cpumask_var_t non_isolated_cpus; |
| @@ -8083,104 +6770,11 @@ int in_sched_functions(unsigned long addr) | |||
| 8083 | && addr < (unsigned long)__sched_text_end); | 6770 | && addr < (unsigned long)__sched_text_end); |
| 8084 | } | 6771 | } |
| 8085 | 6772 | ||
| 8086 | static void init_cfs_rq(struct cfs_rq *cfs_rq) | 6773 | #ifdef CONFIG_CGROUP_SCHED |
| 8087 | { | 6774 | struct task_group root_task_group; |
| 8088 | cfs_rq->tasks_timeline = RB_ROOT; | ||
| 8089 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
| 8090 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
| 8091 | #ifndef CONFIG_64BIT | ||
| 8092 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
| 8093 | #endif | ||
| 8094 | } | ||
| 8095 | |||
| 8096 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
| 8097 | { | ||
| 8098 | struct rt_prio_array *array; | ||
| 8099 | int i; | ||
| 8100 | |||
| 8101 | array = &rt_rq->active; | ||
| 8102 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
| 8103 | INIT_LIST_HEAD(array->queue + i); | ||
| 8104 | __clear_bit(i, array->bitmap); | ||
| 8105 | } | ||
| 8106 | /* delimiter for bitsearch: */ | ||
| 8107 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
| 8108 | |||
| 8109 | #if defined CONFIG_SMP | ||
| 8110 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
| 8111 | rt_rq->highest_prio.next = MAX_RT_PRIO; | ||
| 8112 | rt_rq->rt_nr_migratory = 0; | ||
| 8113 | rt_rq->overloaded = 0; | ||
| 8114 | plist_head_init(&rt_rq->pushable_tasks); | ||
| 8115 | #endif | ||
| 8116 | |||
| 8117 | rt_rq->rt_time = 0; | ||
| 8118 | rt_rq->rt_throttled = 0; | ||
| 8119 | rt_rq->rt_runtime = 0; | ||
| 8120 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | ||
| 8121 | } | ||
| 8122 | |||
| 8123 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 8124 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
| 8125 | struct sched_entity *se, int cpu, | ||
| 8126 | struct sched_entity *parent) | ||
| 8127 | { | ||
| 8128 | struct rq *rq = cpu_rq(cpu); | ||
| 8129 | |||
| 8130 | cfs_rq->tg = tg; | ||
| 8131 | cfs_rq->rq = rq; | ||
| 8132 | #ifdef CONFIG_SMP | ||
| 8133 | /* allow initial update_cfs_load() to truncate */ | ||
| 8134 | cfs_rq->load_stamp = 1; | ||
| 8135 | #endif | ||
| 8136 | init_cfs_rq_runtime(cfs_rq); | ||
| 8137 | |||
| 8138 | tg->cfs_rq[cpu] = cfs_rq; | ||
| 8139 | tg->se[cpu] = se; | ||
| 8140 | |||
| 8141 | /* se could be NULL for root_task_group */ | ||
| 8142 | if (!se) | ||
| 8143 | return; | ||
| 8144 | |||
| 8145 | if (!parent) | ||
| 8146 | se->cfs_rq = &rq->cfs; | ||
| 8147 | else | ||
| 8148 | se->cfs_rq = parent->my_q; | ||
| 8149 | |||
| 8150 | se->my_q = cfs_rq; | ||
| 8151 | update_load_set(&se->load, 0); | ||
| 8152 | se->parent = parent; | ||
| 8153 | } | ||
| 8154 | #endif | 6775 | #endif |
| 8155 | 6776 | ||
| 8156 | #ifdef CONFIG_RT_GROUP_SCHED | 6777 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
| 8157 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
| 8158 | struct sched_rt_entity *rt_se, int cpu, | ||
| 8159 | struct sched_rt_entity *parent) | ||
| 8160 | { | ||
| 8161 | struct rq *rq = cpu_rq(cpu); | ||
| 8162 | |||
| 8163 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
| 8164 | rt_rq->rt_nr_boosted = 0; | ||
| 8165 | rt_rq->rq = rq; | ||
| 8166 | rt_rq->tg = tg; | ||
| 8167 | |||
| 8168 | tg->rt_rq[cpu] = rt_rq; | ||
| 8169 | tg->rt_se[cpu] = rt_se; | ||
| 8170 | |||
| 8171 | if (!rt_se) | ||
| 8172 | return; | ||
| 8173 | |||
| 8174 | if (!parent) | ||
| 8175 | rt_se->rt_rq = &rq->rt; | ||
| 8176 | else | ||
| 8177 | rt_se->rt_rq = parent->my_q; | ||
| 8178 | |||
| 8179 | rt_se->my_q = rt_rq; | ||
| 8180 | rt_se->parent = parent; | ||
| 8181 | INIT_LIST_HEAD(&rt_se->run_list); | ||
| 8182 | } | ||
| 8183 | #endif | ||
| 8184 | 6778 | ||
| 8185 | void __init sched_init(void) | 6779 | void __init sched_init(void) |
| 8186 | { | 6780 | { |
| @@ -8238,9 +6832,17 @@ void __init sched_init(void) | |||
| 8238 | #ifdef CONFIG_CGROUP_SCHED | 6832 | #ifdef CONFIG_CGROUP_SCHED |
| 8239 | list_add(&root_task_group.list, &task_groups); | 6833 | list_add(&root_task_group.list, &task_groups); |
| 8240 | INIT_LIST_HEAD(&root_task_group.children); | 6834 | INIT_LIST_HEAD(&root_task_group.children); |
| 6835 | INIT_LIST_HEAD(&root_task_group.siblings); | ||
| 8241 | autogroup_init(&init_task); | 6836 | autogroup_init(&init_task); |
| 6837 | |||
| 8242 | #endif /* CONFIG_CGROUP_SCHED */ | 6838 | #endif /* CONFIG_CGROUP_SCHED */ |
| 8243 | 6839 | ||
| 6840 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 6841 | root_cpuacct.cpustat = &kernel_cpustat; | ||
| 6842 | root_cpuacct.cpuusage = alloc_percpu(u64); | ||
| 6843 | /* Too early, not expected to fail */ | ||
| 6844 | BUG_ON(!root_cpuacct.cpuusage); | ||
| 6845 | #endif | ||
| 8244 | for_each_possible_cpu(i) { | 6846 | for_each_possible_cpu(i) { |
| 8245 | struct rq *rq; | 6847 | struct rq *rq; |
| 8246 | 6848 | ||
| @@ -8252,7 +6854,7 @@ void __init sched_init(void) | |||
| 8252 | init_cfs_rq(&rq->cfs); | 6854 | init_cfs_rq(&rq->cfs); |
| 8253 | init_rt_rq(&rq->rt, rq); | 6855 | init_rt_rq(&rq->rt, rq); |
| 8254 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6856 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8255 | root_task_group.shares = root_task_group_load; | 6857 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
| 8256 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6858 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| 8257 | /* | 6859 | /* |
| 8258 | * How much cpu bandwidth does root_task_group get? | 6860 | * How much cpu bandwidth does root_task_group get? |
| @@ -8302,7 +6904,7 @@ void __init sched_init(void) | |||
| 8302 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 6904 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
| 8303 | rq_attach_root(rq, &def_root_domain); | 6905 | rq_attach_root(rq, &def_root_domain); |
| 8304 | #ifdef CONFIG_NO_HZ | 6906 | #ifdef CONFIG_NO_HZ |
| 8305 | rq->nohz_balance_kick = 0; | 6907 | rq->nohz_flags = 0; |
| 8306 | #endif | 6908 | #endif |
| 8307 | #endif | 6909 | #endif |
| 8308 | init_rq_hrtick(rq); | 6910 | init_rq_hrtick(rq); |
| @@ -8315,10 +6917,6 @@ void __init sched_init(void) | |||
| 8315 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 6917 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
| 8316 | #endif | 6918 | #endif |
| 8317 | 6919 | ||
| 8318 | #ifdef CONFIG_SMP | ||
| 8319 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | ||
| 8320 | #endif | ||
| 8321 | |||
| 8322 | #ifdef CONFIG_RT_MUTEXES | 6920 | #ifdef CONFIG_RT_MUTEXES |
| 8323 | plist_head_init(&init_task.pi_waiters); | 6921 | plist_head_init(&init_task.pi_waiters); |
| 8324 | #endif | 6922 | #endif |
| @@ -8346,17 +6944,11 @@ void __init sched_init(void) | |||
| 8346 | 6944 | ||
| 8347 | #ifdef CONFIG_SMP | 6945 | #ifdef CONFIG_SMP |
| 8348 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | 6946 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); |
| 8349 | #ifdef CONFIG_NO_HZ | ||
| 8350 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | ||
| 8351 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | ||
| 8352 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
| 8353 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | ||
| 8354 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | ||
| 8355 | #endif | ||
| 8356 | /* May be allocated at isolcpus cmdline parse time */ | 6947 | /* May be allocated at isolcpus cmdline parse time */ |
| 8357 | if (cpu_isolated_map == NULL) | 6948 | if (cpu_isolated_map == NULL) |
| 8358 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 6949 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
| 8359 | #endif /* SMP */ | 6950 | #endif |
| 6951 | init_sched_fair_class(); | ||
| 8360 | 6952 | ||
| 8361 | scheduler_running = 1; | 6953 | scheduler_running = 1; |
| 8362 | } | 6954 | } |
| @@ -8508,169 +7100,14 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
| 8508 | 7100 | ||
| 8509 | #endif | 7101 | #endif |
| 8510 | 7102 | ||
| 8511 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 8512 | static void free_fair_sched_group(struct task_group *tg) | ||
| 8513 | { | ||
| 8514 | int i; | ||
| 8515 | |||
| 8516 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
| 8517 | |||
| 8518 | for_each_possible_cpu(i) { | ||
| 8519 | if (tg->cfs_rq) | ||
| 8520 | kfree(tg->cfs_rq[i]); | ||
| 8521 | if (tg->se) | ||
| 8522 | kfree(tg->se[i]); | ||
| 8523 | } | ||
| 8524 | |||
| 8525 | kfree(tg->cfs_rq); | ||
| 8526 | kfree(tg->se); | ||
| 8527 | } | ||
| 8528 | |||
| 8529 | static | ||
| 8530 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 8531 | { | ||
| 8532 | struct cfs_rq *cfs_rq; | ||
| 8533 | struct sched_entity *se; | ||
| 8534 | int i; | ||
| 8535 | |||
| 8536 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | ||
| 8537 | if (!tg->cfs_rq) | ||
| 8538 | goto err; | ||
| 8539 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); | ||
| 8540 | if (!tg->se) | ||
| 8541 | goto err; | ||
| 8542 | |||
| 8543 | tg->shares = NICE_0_LOAD; | ||
| 8544 | |||
| 8545 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
| 8546 | |||
| 8547 | for_each_possible_cpu(i) { | ||
| 8548 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | ||
| 8549 | GFP_KERNEL, cpu_to_node(i)); | ||
| 8550 | if (!cfs_rq) | ||
| 8551 | goto err; | ||
| 8552 | |||
| 8553 | se = kzalloc_node(sizeof(struct sched_entity), | ||
| 8554 | GFP_KERNEL, cpu_to_node(i)); | ||
| 8555 | if (!se) | ||
| 8556 | goto err_free_rq; | ||
| 8557 | |||
| 8558 | init_cfs_rq(cfs_rq); | ||
| 8559 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | ||
| 8560 | } | ||
| 8561 | |||
| 8562 | return 1; | ||
| 8563 | |||
| 8564 | err_free_rq: | ||
| 8565 | kfree(cfs_rq); | ||
| 8566 | err: | ||
| 8567 | return 0; | ||
| 8568 | } | ||
| 8569 | |||
| 8570 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
| 8571 | { | ||
| 8572 | struct rq *rq = cpu_rq(cpu); | ||
| 8573 | unsigned long flags; | ||
| 8574 | |||
| 8575 | /* | ||
| 8576 | * Only empty task groups can be destroyed; so we can speculatively | ||
| 8577 | * check on_list without danger of it being re-added. | ||
| 8578 | */ | ||
| 8579 | if (!tg->cfs_rq[cpu]->on_list) | ||
| 8580 | return; | ||
| 8581 | |||
| 8582 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 8583 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
| 8584 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 8585 | } | ||
| 8586 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | ||
| 8587 | static inline void free_fair_sched_group(struct task_group *tg) | ||
| 8588 | { | ||
| 8589 | } | ||
| 8590 | |||
| 8591 | static inline | ||
| 8592 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 8593 | { | ||
| 8594 | return 1; | ||
| 8595 | } | ||
| 8596 | |||
| 8597 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
| 8598 | { | ||
| 8599 | } | ||
| 8600 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 8601 | |||
| 8602 | #ifdef CONFIG_RT_GROUP_SCHED | 7103 | #ifdef CONFIG_RT_GROUP_SCHED |
| 8603 | static void free_rt_sched_group(struct task_group *tg) | ||
| 8604 | { | ||
| 8605 | int i; | ||
| 8606 | |||
| 8607 | if (tg->rt_se) | ||
| 8608 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
| 8609 | |||
| 8610 | for_each_possible_cpu(i) { | ||
| 8611 | if (tg->rt_rq) | ||
| 8612 | kfree(tg->rt_rq[i]); | ||
| 8613 | if (tg->rt_se) | ||
| 8614 | kfree(tg->rt_se[i]); | ||
| 8615 | } | ||
| 8616 | |||
| 8617 | kfree(tg->rt_rq); | ||
| 8618 | kfree(tg->rt_se); | ||
| 8619 | } | ||
| 8620 | |||
| 8621 | static | ||
| 8622 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 8623 | { | ||
| 8624 | struct rt_rq *rt_rq; | ||
| 8625 | struct sched_rt_entity *rt_se; | ||
| 8626 | int i; | ||
| 8627 | |||
| 8628 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | ||
| 8629 | if (!tg->rt_rq) | ||
| 8630 | goto err; | ||
| 8631 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); | ||
| 8632 | if (!tg->rt_se) | ||
| 8633 | goto err; | ||
| 8634 | |||
| 8635 | init_rt_bandwidth(&tg->rt_bandwidth, | ||
| 8636 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
| 8637 | |||
| 8638 | for_each_possible_cpu(i) { | ||
| 8639 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | ||
| 8640 | GFP_KERNEL, cpu_to_node(i)); | ||
| 8641 | if (!rt_rq) | ||
| 8642 | goto err; | ||
| 8643 | |||
| 8644 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | ||
| 8645 | GFP_KERNEL, cpu_to_node(i)); | ||
| 8646 | if (!rt_se) | ||
| 8647 | goto err_free_rq; | ||
| 8648 | |||
| 8649 | init_rt_rq(rt_rq, cpu_rq(i)); | ||
| 8650 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 8651 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | ||
| 8652 | } | ||
| 8653 | |||
| 8654 | return 1; | ||
| 8655 | |||
| 8656 | err_free_rq: | ||
| 8657 | kfree(rt_rq); | ||
| 8658 | err: | ||
| 8659 | return 0; | ||
| 8660 | } | ||
| 8661 | #else /* !CONFIG_RT_GROUP_SCHED */ | 7104 | #else /* !CONFIG_RT_GROUP_SCHED */ |
| 8662 | static inline void free_rt_sched_group(struct task_group *tg) | ||
| 8663 | { | ||
| 8664 | } | ||
| 8665 | |||
| 8666 | static inline | ||
| 8667 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 8668 | { | ||
| 8669 | return 1; | ||
| 8670 | } | ||
| 8671 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7105 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 8672 | 7106 | ||
| 8673 | #ifdef CONFIG_CGROUP_SCHED | 7107 | #ifdef CONFIG_CGROUP_SCHED |
| 7108 | /* task_group_lock serializes the addition/removal of task groups */ | ||
| 7109 | static DEFINE_SPINLOCK(task_group_lock); | ||
| 7110 | |||
| 8674 | static void free_sched_group(struct task_group *tg) | 7111 | static void free_sched_group(struct task_group *tg) |
| 8675 | { | 7112 | { |
| 8676 | free_fair_sched_group(tg); | 7113 | free_fair_sched_group(tg); |
| @@ -8776,47 +7213,6 @@ void sched_move_task(struct task_struct *tsk) | |||
| 8776 | #endif /* CONFIG_CGROUP_SCHED */ | 7213 | #endif /* CONFIG_CGROUP_SCHED */ |
| 8777 | 7214 | ||
| 8778 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7215 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8779 | static DEFINE_MUTEX(shares_mutex); | ||
| 8780 | |||
| 8781 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
| 8782 | { | ||
| 8783 | int i; | ||
| 8784 | unsigned long flags; | ||
| 8785 | |||
| 8786 | /* | ||
| 8787 | * We can't change the weight of the root cgroup. | ||
| 8788 | */ | ||
| 8789 | if (!tg->se[0]) | ||
| 8790 | return -EINVAL; | ||
| 8791 | |||
| 8792 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); | ||
| 8793 | |||
| 8794 | mutex_lock(&shares_mutex); | ||
| 8795 | if (tg->shares == shares) | ||
| 8796 | goto done; | ||
| 8797 | |||
| 8798 | tg->shares = shares; | ||
| 8799 | for_each_possible_cpu(i) { | ||
| 8800 | struct rq *rq = cpu_rq(i); | ||
| 8801 | struct sched_entity *se; | ||
| 8802 | |||
| 8803 | se = tg->se[i]; | ||
| 8804 | /* Propagate contribution to hierarchy */ | ||
| 8805 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 8806 | for_each_sched_entity(se) | ||
| 8807 | update_cfs_shares(group_cfs_rq(se)); | ||
| 8808 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 8809 | } | ||
| 8810 | |||
| 8811 | done: | ||
| 8812 | mutex_unlock(&shares_mutex); | ||
| 8813 | return 0; | ||
| 8814 | } | ||
| 8815 | |||
| 8816 | unsigned long sched_group_shares(struct task_group *tg) | ||
| 8817 | { | ||
| 8818 | return tg->shares; | ||
| 8819 | } | ||
| 8820 | #endif | 7216 | #endif |
| 8821 | 7217 | ||
| 8822 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) | 7218 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) |
| @@ -8841,7 +7237,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg) | |||
| 8841 | struct task_struct *g, *p; | 7237 | struct task_struct *g, *p; |
| 8842 | 7238 | ||
| 8843 | do_each_thread(g, p) { | 7239 | do_each_thread(g, p) { |
| 8844 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | 7240 | if (rt_task(p) && task_rq(p)->rt.tg == tg) |
| 8845 | return 1; | 7241 | return 1; |
| 8846 | } while_each_thread(g, p); | 7242 | } while_each_thread(g, p); |
| 8847 | 7243 | ||
| @@ -9192,8 +7588,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | |||
| 9192 | 7588 | ||
| 9193 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | 7589 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) |
| 9194 | { | 7590 | { |
| 9195 | int i, ret = 0, runtime_enabled; | 7591 | int i, ret = 0, runtime_enabled, runtime_was_enabled; |
| 9196 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 7592 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
| 9197 | 7593 | ||
| 9198 | if (tg == &root_task_group) | 7594 | if (tg == &root_task_group) |
| 9199 | return -EINVAL; | 7595 | return -EINVAL; |
| @@ -9220,6 +7616,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
| 9220 | goto out_unlock; | 7616 | goto out_unlock; |
| 9221 | 7617 | ||
| 9222 | runtime_enabled = quota != RUNTIME_INF; | 7618 | runtime_enabled = quota != RUNTIME_INF; |
| 7619 | runtime_was_enabled = cfs_b->quota != RUNTIME_INF; | ||
| 7620 | account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); | ||
| 9223 | raw_spin_lock_irq(&cfs_b->lock); | 7621 | raw_spin_lock_irq(&cfs_b->lock); |
| 9224 | cfs_b->period = ns_to_ktime(period); | 7622 | cfs_b->period = ns_to_ktime(period); |
| 9225 | cfs_b->quota = quota; | 7623 | cfs_b->quota = quota; |
| @@ -9235,13 +7633,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
| 9235 | 7633 | ||
| 9236 | for_each_possible_cpu(i) { | 7634 | for_each_possible_cpu(i) { |
| 9237 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | 7635 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; |
| 9238 | struct rq *rq = rq_of(cfs_rq); | 7636 | struct rq *rq = cfs_rq->rq; |
| 9239 | 7637 | ||
| 9240 | raw_spin_lock_irq(&rq->lock); | 7638 | raw_spin_lock_irq(&rq->lock); |
| 9241 | cfs_rq->runtime_enabled = runtime_enabled; | 7639 | cfs_rq->runtime_enabled = runtime_enabled; |
| 9242 | cfs_rq->runtime_remaining = 0; | 7640 | cfs_rq->runtime_remaining = 0; |
| 9243 | 7641 | ||
| 9244 | if (cfs_rq_throttled(cfs_rq)) | 7642 | if (cfs_rq->throttled) |
| 9245 | unthrottle_cfs_rq(cfs_rq); | 7643 | unthrottle_cfs_rq(cfs_rq); |
| 9246 | raw_spin_unlock_irq(&rq->lock); | 7644 | raw_spin_unlock_irq(&rq->lock); |
| 9247 | } | 7645 | } |
| @@ -9255,7 +7653,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | |||
| 9255 | { | 7653 | { |
| 9256 | u64 quota, period; | 7654 | u64 quota, period; |
| 9257 | 7655 | ||
| 9258 | period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | 7656 | period = ktime_to_ns(tg->cfs_bandwidth.period); |
| 9259 | if (cfs_quota_us < 0) | 7657 | if (cfs_quota_us < 0) |
| 9260 | quota = RUNTIME_INF; | 7658 | quota = RUNTIME_INF; |
| 9261 | else | 7659 | else |
| @@ -9268,10 +7666,10 @@ long tg_get_cfs_quota(struct task_group *tg) | |||
| 9268 | { | 7666 | { |
| 9269 | u64 quota_us; | 7667 | u64 quota_us; |
| 9270 | 7668 | ||
| 9271 | if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) | 7669 | if (tg->cfs_bandwidth.quota == RUNTIME_INF) |
| 9272 | return -1; | 7670 | return -1; |
| 9273 | 7671 | ||
| 9274 | quota_us = tg_cfs_bandwidth(tg)->quota; | 7672 | quota_us = tg->cfs_bandwidth.quota; |
| 9275 | do_div(quota_us, NSEC_PER_USEC); | 7673 | do_div(quota_us, NSEC_PER_USEC); |
| 9276 | 7674 | ||
| 9277 | return quota_us; | 7675 | return quota_us; |
| @@ -9282,7 +7680,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | |||
| 9282 | u64 quota, period; | 7680 | u64 quota, period; |
| 9283 | 7681 | ||
| 9284 | period = (u64)cfs_period_us * NSEC_PER_USEC; | 7682 | period = (u64)cfs_period_us * NSEC_PER_USEC; |
| 9285 | quota = tg_cfs_bandwidth(tg)->quota; | 7683 | quota = tg->cfs_bandwidth.quota; |
| 9286 | 7684 | ||
| 9287 | if (period <= 0) | 7685 | if (period <= 0) |
| 9288 | return -EINVAL; | 7686 | return -EINVAL; |
| @@ -9294,7 +7692,7 @@ long tg_get_cfs_period(struct task_group *tg) | |||
| 9294 | { | 7692 | { |
| 9295 | u64 cfs_period_us; | 7693 | u64 cfs_period_us; |
| 9296 | 7694 | ||
| 9297 | cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | 7695 | cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); |
| 9298 | do_div(cfs_period_us, NSEC_PER_USEC); | 7696 | do_div(cfs_period_us, NSEC_PER_USEC); |
| 9299 | 7697 | ||
| 9300 | return cfs_period_us; | 7698 | return cfs_period_us; |
| @@ -9354,13 +7752,13 @@ static u64 normalize_cfs_quota(struct task_group *tg, | |||
| 9354 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | 7752 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) |
| 9355 | { | 7753 | { |
| 9356 | struct cfs_schedulable_data *d = data; | 7754 | struct cfs_schedulable_data *d = data; |
| 9357 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 7755 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
| 9358 | s64 quota = 0, parent_quota = -1; | 7756 | s64 quota = 0, parent_quota = -1; |
| 9359 | 7757 | ||
| 9360 | if (!tg->parent) { | 7758 | if (!tg->parent) { |
| 9361 | quota = RUNTIME_INF; | 7759 | quota = RUNTIME_INF; |
| 9362 | } else { | 7760 | } else { |
| 9363 | struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); | 7761 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; |
| 9364 | 7762 | ||
| 9365 | quota = normalize_cfs_quota(tg, d); | 7763 | quota = normalize_cfs_quota(tg, d); |
| 9366 | parent_quota = parent_b->hierarchal_quota; | 7764 | parent_quota = parent_b->hierarchal_quota; |
| @@ -9404,7 +7802,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | |||
| 9404 | struct cgroup_map_cb *cb) | 7802 | struct cgroup_map_cb *cb) |
| 9405 | { | 7803 | { |
| 9406 | struct task_group *tg = cgroup_tg(cgrp); | 7804 | struct task_group *tg = cgroup_tg(cgrp); |
| 9407 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 7805 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
| 9408 | 7806 | ||
| 9409 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | 7807 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); |
| 9410 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | 7808 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); |
| @@ -9505,38 +7903,16 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
| 9505 | * (balbir@in.ibm.com). | 7903 | * (balbir@in.ibm.com). |
| 9506 | */ | 7904 | */ |
| 9507 | 7905 | ||
| 9508 | /* track cpu usage of a group of tasks and its child groups */ | ||
| 9509 | struct cpuacct { | ||
| 9510 | struct cgroup_subsys_state css; | ||
| 9511 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
| 9512 | u64 __percpu *cpuusage; | ||
| 9513 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | ||
| 9514 | struct cpuacct *parent; | ||
| 9515 | }; | ||
| 9516 | |||
| 9517 | struct cgroup_subsys cpuacct_subsys; | ||
| 9518 | |||
| 9519 | /* return cpu accounting group corresponding to this container */ | ||
| 9520 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
| 9521 | { | ||
| 9522 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
| 9523 | struct cpuacct, css); | ||
| 9524 | } | ||
| 9525 | |||
| 9526 | /* return cpu accounting group to which this task belongs */ | ||
| 9527 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
| 9528 | { | ||
| 9529 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
| 9530 | struct cpuacct, css); | ||
| 9531 | } | ||
| 9532 | |||
| 9533 | /* create a new cpu accounting group */ | 7906 | /* create a new cpu accounting group */ |
| 9534 | static struct cgroup_subsys_state *cpuacct_create( | 7907 | static struct cgroup_subsys_state *cpuacct_create( |
| 9535 | struct cgroup_subsys *ss, struct cgroup *cgrp) | 7908 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 9536 | { | 7909 | { |
| 9537 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 7910 | struct cpuacct *ca; |
| 9538 | int i; | 7911 | |
| 7912 | if (!cgrp->parent) | ||
| 7913 | return &root_cpuacct.css; | ||
| 9539 | 7914 | ||
| 7915 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
| 9540 | if (!ca) | 7916 | if (!ca) |
| 9541 | goto out; | 7917 | goto out; |
| 9542 | 7918 | ||
| @@ -9544,18 +7920,13 @@ static struct cgroup_subsys_state *cpuacct_create( | |||
| 9544 | if (!ca->cpuusage) | 7920 | if (!ca->cpuusage) |
| 9545 | goto out_free_ca; | 7921 | goto out_free_ca; |
| 9546 | 7922 | ||
| 9547 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) | 7923 | ca->cpustat = alloc_percpu(struct kernel_cpustat); |
| 9548 | if (percpu_counter_init(&ca->cpustat[i], 0)) | 7924 | if (!ca->cpustat) |
| 9549 | goto out_free_counters; | 7925 | goto out_free_cpuusage; |
| 9550 | |||
| 9551 | if (cgrp->parent) | ||
| 9552 | ca->parent = cgroup_ca(cgrp->parent); | ||
| 9553 | 7926 | ||
| 9554 | return &ca->css; | 7927 | return &ca->css; |
| 9555 | 7928 | ||
| 9556 | out_free_counters: | 7929 | out_free_cpuusage: |
| 9557 | while (--i >= 0) | ||
| 9558 | percpu_counter_destroy(&ca->cpustat[i]); | ||
| 9559 | free_percpu(ca->cpuusage); | 7930 | free_percpu(ca->cpuusage); |
| 9560 | out_free_ca: | 7931 | out_free_ca: |
| 9561 | kfree(ca); | 7932 | kfree(ca); |
| @@ -9568,10 +7939,8 @@ static void | |||
| 9568 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | 7939 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 9569 | { | 7940 | { |
| 9570 | struct cpuacct *ca = cgroup_ca(cgrp); | 7941 | struct cpuacct *ca = cgroup_ca(cgrp); |
| 9571 | int i; | ||
| 9572 | 7942 | ||
| 9573 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) | 7943 | free_percpu(ca->cpustat); |
| 9574 | percpu_counter_destroy(&ca->cpustat[i]); | ||
| 9575 | free_percpu(ca->cpuusage); | 7944 | free_percpu(ca->cpuusage); |
| 9576 | kfree(ca); | 7945 | kfree(ca); |
| 9577 | } | 7946 | } |
| @@ -9664,16 +8033,31 @@ static const char *cpuacct_stat_desc[] = { | |||
| 9664 | }; | 8033 | }; |
| 9665 | 8034 | ||
| 9666 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | 8035 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, |
| 9667 | struct cgroup_map_cb *cb) | 8036 | struct cgroup_map_cb *cb) |
| 9668 | { | 8037 | { |
| 9669 | struct cpuacct *ca = cgroup_ca(cgrp); | 8038 | struct cpuacct *ca = cgroup_ca(cgrp); |
| 9670 | int i; | 8039 | int cpu; |
| 8040 | s64 val = 0; | ||
| 8041 | |||
| 8042 | for_each_online_cpu(cpu) { | ||
| 8043 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
| 8044 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
| 8045 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
| 8046 | } | ||
| 8047 | val = cputime64_to_clock_t(val); | ||
| 8048 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
| 9671 | 8049 | ||
| 9672 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { | 8050 | val = 0; |
| 9673 | s64 val = percpu_counter_read(&ca->cpustat[i]); | 8051 | for_each_online_cpu(cpu) { |
| 9674 | val = cputime64_to_clock_t(val); | 8052 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); |
| 9675 | cb->fill(cb, cpuacct_stat_desc[i], val); | 8053 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; |
| 8054 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
| 8055 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
| 9676 | } | 8056 | } |
| 8057 | |||
| 8058 | val = cputime64_to_clock_t(val); | ||
| 8059 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
| 8060 | |||
| 9677 | return 0; | 8061 | return 0; |
| 9678 | } | 8062 | } |
| 9679 | 8063 | ||
| @@ -9703,7 +8087,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 9703 | * | 8087 | * |
| 9704 | * called with rq->lock held. | 8088 | * called with rq->lock held. |
| 9705 | */ | 8089 | */ |
| 9706 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | 8090 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) |
| 9707 | { | 8091 | { |
| 9708 | struct cpuacct *ca; | 8092 | struct cpuacct *ca; |
| 9709 | int cpu; | 8093 | int cpu; |
| @@ -9717,7 +8101,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
| 9717 | 8101 | ||
| 9718 | ca = task_ca(tsk); | 8102 | ca = task_ca(tsk); |
| 9719 | 8103 | ||
| 9720 | for (; ca; ca = ca->parent) { | 8104 | for (; ca; ca = parent_ca(ca)) { |
| 9721 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 8105 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
| 9722 | *cpuusage += cputime; | 8106 | *cpuusage += cputime; |
| 9723 | } | 8107 | } |
| @@ -9725,46 +8109,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
| 9725 | rcu_read_unlock(); | 8109 | rcu_read_unlock(); |
| 9726 | } | 8110 | } |
| 9727 | 8111 | ||
| 9728 | /* | ||
| 9729 | * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large | ||
| 9730 | * in cputime_t units. As a result, cpuacct_update_stats calls | ||
| 9731 | * percpu_counter_add with values large enough to always overflow the | ||
| 9732 | * per cpu batch limit causing bad SMP scalability. | ||
| 9733 | * | ||
| 9734 | * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we | ||
| 9735 | * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled | ||
| 9736 | * and enabled. We cap it at INT_MAX which is the largest allowed batch value. | ||
| 9737 | */ | ||
| 9738 | #ifdef CONFIG_SMP | ||
| 9739 | #define CPUACCT_BATCH \ | ||
| 9740 | min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) | ||
| 9741 | #else | ||
| 9742 | #define CPUACCT_BATCH 0 | ||
| 9743 | #endif | ||
| 9744 | |||
| 9745 | /* | ||
| 9746 | * Charge the system/user time to the task's accounting group. | ||
| 9747 | */ | ||
| 9748 | static void cpuacct_update_stats(struct task_struct *tsk, | ||
| 9749 | enum cpuacct_stat_index idx, cputime_t val) | ||
| 9750 | { | ||
| 9751 | struct cpuacct *ca; | ||
| 9752 | int batch = CPUACCT_BATCH; | ||
| 9753 | |||
| 9754 | if (unlikely(!cpuacct_subsys.active)) | ||
| 9755 | return; | ||
| 9756 | |||
| 9757 | rcu_read_lock(); | ||
| 9758 | ca = task_ca(tsk); | ||
| 9759 | |||
| 9760 | do { | ||
| 9761 | __percpu_counter_add(&ca->cpustat[idx], | ||
| 9762 | (__force s64) val, batch); | ||
| 9763 | ca = ca->parent; | ||
| 9764 | } while (ca); | ||
| 9765 | rcu_read_unlock(); | ||
| 9766 | } | ||
| 9767 | |||
| 9768 | struct cgroup_subsys cpuacct_subsys = { | 8112 | struct cgroup_subsys cpuacct_subsys = { |
| 9769 | .name = "cpuacct", | 8113 | .name = "cpuacct", |
| 9770 | .create = cpuacct_create, | 8114 | .create = cpuacct_create, |
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c index a86cf9d9eb11..b0d798eaf130 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched/cpupri.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * kernel/sched_cpupri.c | 2 | * kernel/sched/cpupri.c |
| 3 | * | 3 | * |
| 4 | * CPU priority management | 4 | * CPU priority management |
| 5 | * | 5 | * |
| @@ -28,7 +28,7 @@ | |||
| 28 | */ | 28 | */ |
| 29 | 29 | ||
| 30 | #include <linux/gfp.h> | 30 | #include <linux/gfp.h> |
| 31 | #include "sched_cpupri.h" | 31 | #include "cpupri.h" |
| 32 | 32 | ||
| 33 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 33 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
| 34 | static int convert_prio(int prio) | 34 | static int convert_prio(int prio) |
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h index f6d756173491..f6d756173491 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched/cpupri.h | |||
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c index a6710a112b4f..2a075e10004b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched/debug.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * kernel/time/sched_debug.c | 2 | * kernel/sched/debug.c |
| 3 | * | 3 | * |
| 4 | * Print the CFS rbtree | 4 | * Print the CFS rbtree |
| 5 | * | 5 | * |
| @@ -16,6 +16,8 @@ | |||
| 16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
| 17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
| 18 | 18 | ||
| 19 | #include "sched.h" | ||
| 20 | |||
| 19 | static DEFINE_SPINLOCK(sched_debug_lock); | 21 | static DEFINE_SPINLOCK(sched_debug_lock); |
| 20 | 22 | ||
| 21 | /* | 23 | /* |
| @@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
| 373 | return 0; | 375 | return 0; |
| 374 | } | 376 | } |
| 375 | 377 | ||
| 376 | static void sysrq_sched_debug_show(void) | 378 | void sysrq_sched_debug_show(void) |
| 377 | { | 379 | { |
| 378 | sched_debug_show(NULL, NULL); | 380 | sched_debug_show(NULL, NULL); |
| 379 | } | 381 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c index a78ed2736ba7..a4d2b7abc3cd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched/fair.c | |||
| @@ -23,6 +23,13 @@ | |||
| 23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
| 24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
| 25 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
| 26 | #include <linux/slab.h> | ||
| 27 | #include <linux/profile.h> | ||
| 28 | #include <linux/interrupt.h> | ||
| 29 | |||
| 30 | #include <trace/events/sched.h> | ||
| 31 | |||
| 32 | #include "sched.h" | ||
| 26 | 33 | ||
| 27 | /* | 34 | /* |
| 28 | * Targeted preemption latency for CPU-bound tasks: | 35 | * Targeted preemption latency for CPU-bound tasks: |
| @@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | |||
| 103 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 110 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
| 104 | #endif | 111 | #endif |
| 105 | 112 | ||
| 106 | static const struct sched_class fair_sched_class; | 113 | /* |
| 114 | * Increase the granularity value when there are more CPUs, | ||
| 115 | * because with more CPUs the 'effective latency' as visible | ||
| 116 | * to users decreases. But the relationship is not linear, | ||
| 117 | * so pick a second-best guess by going with the log2 of the | ||
| 118 | * number of CPUs. | ||
| 119 | * | ||
| 120 | * This idea comes from the SD scheduler of Con Kolivas: | ||
| 121 | */ | ||
| 122 | static int get_update_sysctl_factor(void) | ||
| 123 | { | ||
| 124 | unsigned int cpus = min_t(int, num_online_cpus(), 8); | ||
| 125 | unsigned int factor; | ||
| 126 | |||
| 127 | switch (sysctl_sched_tunable_scaling) { | ||
| 128 | case SCHED_TUNABLESCALING_NONE: | ||
| 129 | factor = 1; | ||
| 130 | break; | ||
| 131 | case SCHED_TUNABLESCALING_LINEAR: | ||
| 132 | factor = cpus; | ||
| 133 | break; | ||
| 134 | case SCHED_TUNABLESCALING_LOG: | ||
| 135 | default: | ||
| 136 | factor = 1 + ilog2(cpus); | ||
| 137 | break; | ||
| 138 | } | ||
| 139 | |||
| 140 | return factor; | ||
| 141 | } | ||
| 142 | |||
| 143 | static void update_sysctl(void) | ||
| 144 | { | ||
| 145 | unsigned int factor = get_update_sysctl_factor(); | ||
| 146 | |||
| 147 | #define SET_SYSCTL(name) \ | ||
| 148 | (sysctl_##name = (factor) * normalized_sysctl_##name) | ||
| 149 | SET_SYSCTL(sched_min_granularity); | ||
| 150 | SET_SYSCTL(sched_latency); | ||
| 151 | SET_SYSCTL(sched_wakeup_granularity); | ||
| 152 | #undef SET_SYSCTL | ||
| 153 | } | ||
| 154 | |||
| 155 | void sched_init_granularity(void) | ||
| 156 | { | ||
| 157 | update_sysctl(); | ||
| 158 | } | ||
| 159 | |||
| 160 | #if BITS_PER_LONG == 32 | ||
| 161 | # define WMULT_CONST (~0UL) | ||
| 162 | #else | ||
| 163 | # define WMULT_CONST (1UL << 32) | ||
| 164 | #endif | ||
| 165 | |||
| 166 | #define WMULT_SHIFT 32 | ||
| 167 | |||
| 168 | /* | ||
| 169 | * Shift right and round: | ||
| 170 | */ | ||
| 171 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | ||
| 172 | |||
| 173 | /* | ||
| 174 | * delta *= weight / lw | ||
| 175 | */ | ||
| 176 | static unsigned long | ||
| 177 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | ||
| 178 | struct load_weight *lw) | ||
| 179 | { | ||
| 180 | u64 tmp; | ||
| 181 | |||
| 182 | /* | ||
| 183 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
| 184 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
| 185 | * 2^SCHED_LOAD_RESOLUTION. | ||
| 186 | */ | ||
| 187 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
| 188 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
| 189 | else | ||
| 190 | tmp = (u64)delta_exec; | ||
| 191 | |||
| 192 | if (!lw->inv_weight) { | ||
| 193 | unsigned long w = scale_load_down(lw->weight); | ||
| 194 | |||
| 195 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
| 196 | lw->inv_weight = 1; | ||
| 197 | else if (unlikely(!w)) | ||
| 198 | lw->inv_weight = WMULT_CONST; | ||
| 199 | else | ||
| 200 | lw->inv_weight = WMULT_CONST / w; | ||
| 201 | } | ||
| 202 | |||
| 203 | /* | ||
| 204 | * Check whether we'd overflow the 64-bit multiplication: | ||
| 205 | */ | ||
| 206 | if (unlikely(tmp > WMULT_CONST)) | ||
| 207 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | ||
| 208 | WMULT_SHIFT/2); | ||
| 209 | else | ||
| 210 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | ||
| 211 | |||
| 212 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | ||
| 213 | } | ||
| 214 | |||
| 215 | |||
| 216 | const struct sched_class fair_sched_class; | ||
| 107 | 217 | ||
| 108 | /************************************************************** | 218 | /************************************************************** |
| 109 | * CFS operations on generic schedulable entities: | 219 | * CFS operations on generic schedulable entities: |
| @@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 413 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 523 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
| 414 | } | 524 | } |
| 415 | 525 | ||
| 416 | static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) | 526 | struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) |
| 417 | { | 527 | { |
| 418 | struct rb_node *left = cfs_rq->rb_leftmost; | 528 | struct rb_node *left = cfs_rq->rb_leftmost; |
| 419 | 529 | ||
| @@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) | |||
| 434 | } | 544 | } |
| 435 | 545 | ||
| 436 | #ifdef CONFIG_SCHED_DEBUG | 546 | #ifdef CONFIG_SCHED_DEBUG |
| 437 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | 547 | struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
| 438 | { | 548 | { |
| 439 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); | 549 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
| 440 | 550 | ||
| @@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 684 | { | 794 | { |
| 685 | update_load_add(&cfs_rq->load, se->load.weight); | 795 | update_load_add(&cfs_rq->load, se->load.weight); |
| 686 | if (!parent_entity(se)) | 796 | if (!parent_entity(se)) |
| 687 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | 797 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
| 688 | if (entity_is_task(se)) { | 798 | if (entity_is_task(se)) { |
| 689 | add_cfs_task_weight(cfs_rq, se->load.weight); | 799 | add_cfs_task_weight(cfs_rq, se->load.weight); |
| 690 | list_add(&se->group_node, &cfs_rq->tasks); | 800 | list_add(&se->group_node, &cfs_rq->tasks); |
| @@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 697 | { | 807 | { |
| 698 | update_load_sub(&cfs_rq->load, se->load.weight); | 808 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 699 | if (!parent_entity(se)) | 809 | if (!parent_entity(se)) |
| 700 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | 810 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
| 701 | if (entity_is_task(se)) { | 811 | if (entity_is_task(se)) { |
| 702 | add_cfs_task_weight(cfs_rq, -se->load.weight); | 812 | add_cfs_task_weight(cfs_rq, -se->load.weight); |
| 703 | list_del_init(&se->group_node); | 813 | list_del_init(&se->group_node); |
| @@ -920,6 +1030,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 920 | trace_sched_stat_iowait(tsk, delta); | 1030 | trace_sched_stat_iowait(tsk, delta); |
| 921 | } | 1031 | } |
| 922 | 1032 | ||
| 1033 | trace_sched_stat_blocked(tsk, delta); | ||
| 1034 | |||
| 923 | /* | 1035 | /* |
| 924 | * Blocking time is in units of nanosecs, so shift by | 1036 | * Blocking time is in units of nanosecs, so shift by |
| 925 | * 20 to get a milliseconds-range estimation of the | 1037 | * 20 to get a milliseconds-range estimation of the |
| @@ -1287,6 +1399,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
| 1287 | */ | 1399 | */ |
| 1288 | 1400 | ||
| 1289 | #ifdef CONFIG_CFS_BANDWIDTH | 1401 | #ifdef CONFIG_CFS_BANDWIDTH |
| 1402 | |||
| 1403 | #ifdef HAVE_JUMP_LABEL | ||
| 1404 | static struct jump_label_key __cfs_bandwidth_used; | ||
| 1405 | |||
| 1406 | static inline bool cfs_bandwidth_used(void) | ||
| 1407 | { | ||
| 1408 | return static_branch(&__cfs_bandwidth_used); | ||
| 1409 | } | ||
| 1410 | |||
| 1411 | void account_cfs_bandwidth_used(int enabled, int was_enabled) | ||
| 1412 | { | ||
| 1413 | /* only need to count groups transitioning between enabled/!enabled */ | ||
| 1414 | if (enabled && !was_enabled) | ||
| 1415 | jump_label_inc(&__cfs_bandwidth_used); | ||
| 1416 | else if (!enabled && was_enabled) | ||
| 1417 | jump_label_dec(&__cfs_bandwidth_used); | ||
| 1418 | } | ||
| 1419 | #else /* HAVE_JUMP_LABEL */ | ||
| 1420 | static bool cfs_bandwidth_used(void) | ||
| 1421 | { | ||
| 1422 | return true; | ||
| 1423 | } | ||
| 1424 | |||
| 1425 | void account_cfs_bandwidth_used(int enabled, int was_enabled) {} | ||
| 1426 | #endif /* HAVE_JUMP_LABEL */ | ||
| 1427 | |||
| 1290 | /* | 1428 | /* |
| 1291 | * default period for cfs group bandwidth. | 1429 | * default period for cfs group bandwidth. |
| 1292 | * default: 0.1s, units: nanoseconds | 1430 | * default: 0.1s, units: nanoseconds |
| @@ -1308,7 +1446,7 @@ static inline u64 sched_cfs_bandwidth_slice(void) | |||
| 1308 | * | 1446 | * |
| 1309 | * requires cfs_b->lock | 1447 | * requires cfs_b->lock |
| 1310 | */ | 1448 | */ |
| 1311 | static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | 1449 | void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) |
| 1312 | { | 1450 | { |
| 1313 | u64 now; | 1451 | u64 now; |
| 1314 | 1452 | ||
| @@ -1320,6 +1458,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | |||
| 1320 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); | 1458 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); |
| 1321 | } | 1459 | } |
| 1322 | 1460 | ||
| 1461 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
| 1462 | { | ||
| 1463 | return &tg->cfs_bandwidth; | ||
| 1464 | } | ||
| 1465 | |||
| 1323 | /* returns 0 on failure to allocate runtime */ | 1466 | /* returns 0 on failure to allocate runtime */ |
| 1324 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 1467 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
| 1325 | { | 1468 | { |
| @@ -1421,7 +1564,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | |||
| 1421 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 1564 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, |
| 1422 | unsigned long delta_exec) | 1565 | unsigned long delta_exec) |
| 1423 | { | 1566 | { |
| 1424 | if (!cfs_rq->runtime_enabled) | 1567 | if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) |
| 1425 | return; | 1568 | return; |
| 1426 | 1569 | ||
| 1427 | __account_cfs_rq_runtime(cfs_rq, delta_exec); | 1570 | __account_cfs_rq_runtime(cfs_rq, delta_exec); |
| @@ -1429,13 +1572,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | |||
| 1429 | 1572 | ||
| 1430 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | 1573 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) |
| 1431 | { | 1574 | { |
| 1432 | return cfs_rq->throttled; | 1575 | return cfs_bandwidth_used() && cfs_rq->throttled; |
| 1433 | } | 1576 | } |
| 1434 | 1577 | ||
| 1435 | /* check whether cfs_rq, or any parent, is throttled */ | 1578 | /* check whether cfs_rq, or any parent, is throttled */ |
| 1436 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | 1579 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) |
| 1437 | { | 1580 | { |
| 1438 | return cfs_rq->throttle_count; | 1581 | return cfs_bandwidth_used() && cfs_rq->throttle_count; |
| 1439 | } | 1582 | } |
| 1440 | 1583 | ||
| 1441 | /* | 1584 | /* |
| @@ -1530,7 +1673,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 1530 | raw_spin_unlock(&cfs_b->lock); | 1673 | raw_spin_unlock(&cfs_b->lock); |
| 1531 | } | 1674 | } |
| 1532 | 1675 | ||
| 1533 | static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | 1676 | void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) |
| 1534 | { | 1677 | { |
| 1535 | struct rq *rq = rq_of(cfs_rq); | 1678 | struct rq *rq = rq_of(cfs_rq); |
| 1536 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | 1679 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); |
| @@ -1756,6 +1899,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
| 1756 | 1899 | ||
| 1757 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 1900 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
| 1758 | { | 1901 | { |
| 1902 | if (!cfs_bandwidth_used()) | ||
| 1903 | return; | ||
| 1904 | |||
| 1759 | if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) | 1905 | if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) |
| 1760 | return; | 1906 | return; |
| 1761 | 1907 | ||
| @@ -1801,6 +1947,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
| 1801 | */ | 1947 | */ |
| 1802 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | 1948 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) |
| 1803 | { | 1949 | { |
| 1950 | if (!cfs_bandwidth_used()) | ||
| 1951 | return; | ||
| 1952 | |||
| 1804 | /* an active group must be handled by the update_curr()->put() path */ | 1953 | /* an active group must be handled by the update_curr()->put() path */ |
| 1805 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | 1954 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) |
| 1806 | return; | 1955 | return; |
| @@ -1818,6 +1967,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | |||
| 1818 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | 1967 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ |
| 1819 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 1968 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
| 1820 | { | 1969 | { |
| 1970 | if (!cfs_bandwidth_used()) | ||
| 1971 | return; | ||
| 1972 | |||
| 1821 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | 1973 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) |
| 1822 | return; | 1974 | return; |
| 1823 | 1975 | ||
| @@ -1830,7 +1982,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
| 1830 | 1982 | ||
| 1831 | throttle_cfs_rq(cfs_rq); | 1983 | throttle_cfs_rq(cfs_rq); |
| 1832 | } | 1984 | } |
| 1833 | #else | 1985 | |
| 1986 | static inline u64 default_cfs_period(void); | ||
| 1987 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
| 1988 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
| 1989 | |||
| 1990 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
| 1991 | { | ||
| 1992 | struct cfs_bandwidth *cfs_b = | ||
| 1993 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
| 1994 | do_sched_cfs_slack_timer(cfs_b); | ||
| 1995 | |||
| 1996 | return HRTIMER_NORESTART; | ||
| 1997 | } | ||
| 1998 | |||
| 1999 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
| 2000 | { | ||
| 2001 | struct cfs_bandwidth *cfs_b = | ||
| 2002 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
| 2003 | ktime_t now; | ||
| 2004 | int overrun; | ||
| 2005 | int idle = 0; | ||
| 2006 | |||
| 2007 | for (;;) { | ||
| 2008 | now = hrtimer_cb_get_time(timer); | ||
| 2009 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
| 2010 | |||
| 2011 | if (!overrun) | ||
| 2012 | break; | ||
| 2013 | |||
| 2014 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
| 2015 | } | ||
| 2016 | |||
| 2017 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
| 2018 | } | ||
| 2019 | |||
| 2020 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
| 2021 | { | ||
| 2022 | raw_spin_lock_init(&cfs_b->lock); | ||
| 2023 | cfs_b->runtime = 0; | ||
| 2024 | cfs_b->quota = RUNTIME_INF; | ||
| 2025 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
| 2026 | |||
| 2027 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
| 2028 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 2029 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
| 2030 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 2031 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
| 2032 | } | ||
| 2033 | |||
| 2034 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
| 2035 | { | ||
| 2036 | cfs_rq->runtime_enabled = 0; | ||
| 2037 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
| 2038 | } | ||
| 2039 | |||
| 2040 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
| 2041 | void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
| 2042 | { | ||
| 2043 | /* | ||
| 2044 | * The timer may be active because we're trying to set a new bandwidth | ||
| 2045 | * period or because we're racing with the tear-down path | ||
| 2046 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
| 2047 | * terminates). In either case we ensure that it's re-programmed | ||
| 2048 | */ | ||
| 2049 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
| 2050 | raw_spin_unlock(&cfs_b->lock); | ||
| 2051 | /* ensure cfs_b->lock is available while we wait */ | ||
| 2052 | hrtimer_cancel(&cfs_b->period_timer); | ||
| 2053 | |||
| 2054 | raw_spin_lock(&cfs_b->lock); | ||
| 2055 | /* if someone else restarted the timer then we're done */ | ||
| 2056 | if (cfs_b->timer_active) | ||
| 2057 | return; | ||
| 2058 | } | ||
| 2059 | |||
| 2060 | cfs_b->timer_active = 1; | ||
| 2061 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
| 2062 | } | ||
| 2063 | |||
| 2064 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
| 2065 | { | ||
| 2066 | hrtimer_cancel(&cfs_b->period_timer); | ||
| 2067 | hrtimer_cancel(&cfs_b->slack_timer); | ||
| 2068 | } | ||
| 2069 | |||
| 2070 | void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
| 2071 | { | ||
| 2072 | struct cfs_rq *cfs_rq; | ||
| 2073 | |||
| 2074 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
| 2075 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
| 2076 | |||
| 2077 | if (!cfs_rq->runtime_enabled) | ||
| 2078 | continue; | ||
| 2079 | |||
| 2080 | /* | ||
| 2081 | * clock_task is not advancing so we just need to make sure | ||
| 2082 | * there's some valid quota amount | ||
| 2083 | */ | ||
| 2084 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
| 2085 | if (cfs_rq_throttled(cfs_rq)) | ||
| 2086 | unthrottle_cfs_rq(cfs_rq); | ||
| 2087 | } | ||
| 2088 | } | ||
| 2089 | |||
| 2090 | #else /* CONFIG_CFS_BANDWIDTH */ | ||
| 1834 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 2091 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, |
| 1835 | unsigned long delta_exec) {} | 2092 | unsigned long delta_exec) {} |
| 1836 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 2093 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} |
| @@ -1852,8 +2109,22 @@ static inline int throttled_lb_pair(struct task_group *tg, | |||
| 1852 | { | 2109 | { |
| 1853 | return 0; | 2110 | return 0; |
| 1854 | } | 2111 | } |
| 2112 | |||
| 2113 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
| 2114 | |||
| 2115 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 2116 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
| 1855 | #endif | 2117 | #endif |
| 1856 | 2118 | ||
| 2119 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
| 2120 | { | ||
| 2121 | return NULL; | ||
| 2122 | } | ||
| 2123 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
| 2124 | void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
| 2125 | |||
| 2126 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
| 2127 | |||
| 1857 | /************************************************** | 2128 | /************************************************** |
| 1858 | * CFS operations on tasks: | 2129 | * CFS operations on tasks: |
| 1859 | */ | 2130 | */ |
| @@ -1866,7 +2137,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
| 1866 | 2137 | ||
| 1867 | WARN_ON(task_rq(p) != rq); | 2138 | WARN_ON(task_rq(p) != rq); |
| 1868 | 2139 | ||
| 1869 | if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { | 2140 | if (cfs_rq->nr_running > 1) { |
| 1870 | u64 slice = sched_slice(cfs_rq, se); | 2141 | u64 slice = sched_slice(cfs_rq, se); |
| 1871 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | 2142 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
| 1872 | s64 delta = slice - ran; | 2143 | s64 delta = slice - ran; |
| @@ -1897,7 +2168,7 @@ static void hrtick_update(struct rq *rq) | |||
| 1897 | { | 2168 | { |
| 1898 | struct task_struct *curr = rq->curr; | 2169 | struct task_struct *curr = rq->curr; |
| 1899 | 2170 | ||
| 1900 | if (curr->sched_class != &fair_sched_class) | 2171 | if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) |
| 1901 | return; | 2172 | return; |
| 1902 | 2173 | ||
| 1903 | if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) | 2174 | if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) |
| @@ -2020,6 +2291,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 2020 | } | 2291 | } |
| 2021 | 2292 | ||
| 2022 | #ifdef CONFIG_SMP | 2293 | #ifdef CONFIG_SMP |
| 2294 | /* Used instead of source_load when we know the type == 0 */ | ||
| 2295 | static unsigned long weighted_cpuload(const int cpu) | ||
| 2296 | { | ||
| 2297 | return cpu_rq(cpu)->load.weight; | ||
| 2298 | } | ||
| 2299 | |||
| 2300 | /* | ||
| 2301 | * Return a low guess at the load of a migration-source cpu weighted | ||
| 2302 | * according to the scheduling class and "nice" value. | ||
| 2303 | * | ||
| 2304 | * We want to under-estimate the load of migration sources, to | ||
| 2305 | * balance conservatively. | ||
| 2306 | */ | ||
| 2307 | static unsigned long source_load(int cpu, int type) | ||
| 2308 | { | ||
| 2309 | struct rq *rq = cpu_rq(cpu); | ||
| 2310 | unsigned long total = weighted_cpuload(cpu); | ||
| 2311 | |||
| 2312 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
| 2313 | return total; | ||
| 2314 | |||
| 2315 | return min(rq->cpu_load[type-1], total); | ||
| 2316 | } | ||
| 2317 | |||
| 2318 | /* | ||
| 2319 | * Return a high guess at the load of a migration-target cpu weighted | ||
| 2320 | * according to the scheduling class and "nice" value. | ||
| 2321 | */ | ||
| 2322 | static unsigned long target_load(int cpu, int type) | ||
| 2323 | { | ||
| 2324 | struct rq *rq = cpu_rq(cpu); | ||
| 2325 | unsigned long total = weighted_cpuload(cpu); | ||
| 2326 | |||
| 2327 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
| 2328 | return total; | ||
| 2329 | |||
| 2330 | return max(rq->cpu_load[type-1], total); | ||
| 2331 | } | ||
| 2332 | |||
| 2333 | static unsigned long power_of(int cpu) | ||
| 2334 | { | ||
| 2335 | return cpu_rq(cpu)->cpu_power; | ||
| 2336 | } | ||
| 2337 | |||
| 2338 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
| 2339 | { | ||
| 2340 | struct rq *rq = cpu_rq(cpu); | ||
| 2341 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | ||
| 2342 | |||
| 2343 | if (nr_running) | ||
| 2344 | return rq->load.weight / nr_running; | ||
| 2345 | |||
| 2346 | return 0; | ||
| 2347 | } | ||
| 2348 | |||
| 2023 | 2349 | ||
| 2024 | static void task_waking_fair(struct task_struct *p) | 2350 | static void task_waking_fair(struct task_struct *p) |
| 2025 | { | 2351 | { |
| @@ -2318,6 +2644,28 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 2318 | return idlest; | 2644 | return idlest; |
| 2319 | } | 2645 | } |
| 2320 | 2646 | ||
| 2647 | /** | ||
| 2648 | * highest_flag_domain - Return highest sched_domain containing flag. | ||
| 2649 | * @cpu: The cpu whose highest level of sched domain is to | ||
| 2650 | * be returned. | ||
| 2651 | * @flag: The flag to check for the highest sched_domain | ||
| 2652 | * for the given cpu. | ||
| 2653 | * | ||
| 2654 | * Returns the highest sched_domain of a cpu which contains the given flag. | ||
| 2655 | */ | ||
| 2656 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | ||
| 2657 | { | ||
| 2658 | struct sched_domain *sd, *hsd = NULL; | ||
| 2659 | |||
| 2660 | for_each_domain(cpu, sd) { | ||
| 2661 | if (!(sd->flags & flag)) | ||
| 2662 | break; | ||
| 2663 | hsd = sd; | ||
| 2664 | } | ||
| 2665 | |||
| 2666 | return hsd; | ||
| 2667 | } | ||
| 2668 | |||
| 2321 | /* | 2669 | /* |
| 2322 | * Try and locate an idle CPU in the sched_domain. | 2670 | * Try and locate an idle CPU in the sched_domain. |
| 2323 | */ | 2671 | */ |
| @@ -2327,7 +2675,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
| 2327 | int prev_cpu = task_cpu(p); | 2675 | int prev_cpu = task_cpu(p); |
| 2328 | struct sched_domain *sd; | 2676 | struct sched_domain *sd; |
| 2329 | struct sched_group *sg; | 2677 | struct sched_group *sg; |
| 2330 | int i, smt = 0; | 2678 | int i; |
| 2331 | 2679 | ||
| 2332 | /* | 2680 | /* |
| 2333 | * If the task is going to be woken-up on this cpu and if it is | 2681 | * If the task is going to be woken-up on this cpu and if it is |
| @@ -2347,19 +2695,9 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
| 2347 | * Otherwise, iterate the domains and find an elegible idle cpu. | 2695 | * Otherwise, iterate the domains and find an elegible idle cpu. |
| 2348 | */ | 2696 | */ |
| 2349 | rcu_read_lock(); | 2697 | rcu_read_lock(); |
| 2350 | again: | ||
| 2351 | for_each_domain(target, sd) { | ||
| 2352 | if (!smt && (sd->flags & SD_SHARE_CPUPOWER)) | ||
| 2353 | continue; | ||
| 2354 | |||
| 2355 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) { | ||
| 2356 | if (!smt) { | ||
| 2357 | smt = 1; | ||
| 2358 | goto again; | ||
| 2359 | } | ||
| 2360 | break; | ||
| 2361 | } | ||
| 2362 | 2698 | ||
| 2699 | sd = highest_flag_domain(target, SD_SHARE_PKG_RESOURCES); | ||
| 2700 | for_each_lower_domain(sd) { | ||
| 2363 | sg = sd->groups; | 2701 | sg = sd->groups; |
| 2364 | do { | 2702 | do { |
| 2365 | if (!cpumask_intersects(sched_group_cpus(sg), | 2703 | if (!cpumask_intersects(sched_group_cpus(sg), |
| @@ -2406,6 +2744,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
| 2406 | int want_sd = 1; | 2744 | int want_sd = 1; |
| 2407 | int sync = wake_flags & WF_SYNC; | 2745 | int sync = wake_flags & WF_SYNC; |
| 2408 | 2746 | ||
| 2747 | if (p->rt.nr_cpus_allowed == 1) | ||
| 2748 | return prev_cpu; | ||
| 2749 | |||
| 2409 | if (sd_flag & SD_BALANCE_WAKE) { | 2750 | if (sd_flag & SD_BALANCE_WAKE) { |
| 2410 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | 2751 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
| 2411 | want_affine = 1; | 2752 | want_affine = 1; |
| @@ -2690,7 +3031,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
| 2690 | } while (cfs_rq); | 3031 | } while (cfs_rq); |
| 2691 | 3032 | ||
| 2692 | p = task_of(se); | 3033 | p = task_of(se); |
| 2693 | hrtick_start_fair(rq, p); | 3034 | if (hrtick_enabled(rq)) |
| 3035 | hrtick_start_fair(rq, p); | ||
| 2694 | 3036 | ||
| 2695 | return p; | 3037 | return p; |
| 2696 | } | 3038 | } |
| @@ -2734,6 +3076,12 @@ static void yield_task_fair(struct rq *rq) | |||
| 2734 | * Update run-time statistics of the 'current'. | 3076 | * Update run-time statistics of the 'current'. |
| 2735 | */ | 3077 | */ |
| 2736 | update_curr(cfs_rq); | 3078 | update_curr(cfs_rq); |
| 3079 | /* | ||
| 3080 | * Tell update_rq_clock() that we've just updated, | ||
| 3081 | * so we don't do microscopic update in schedule() | ||
| 3082 | * and double the fastpath cost. | ||
| 3083 | */ | ||
| 3084 | rq->skip_clock_update = 1; | ||
| 2737 | } | 3085 | } |
| 2738 | 3086 | ||
| 2739 | set_skip_buddy(se); | 3087 | set_skip_buddy(se); |
| @@ -2774,6 +3122,38 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
| 2774 | } | 3122 | } |
| 2775 | 3123 | ||
| 2776 | /* | 3124 | /* |
| 3125 | * Is this task likely cache-hot: | ||
| 3126 | */ | ||
| 3127 | static int | ||
| 3128 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
| 3129 | { | ||
| 3130 | s64 delta; | ||
| 3131 | |||
| 3132 | if (p->sched_class != &fair_sched_class) | ||
| 3133 | return 0; | ||
| 3134 | |||
| 3135 | if (unlikely(p->policy == SCHED_IDLE)) | ||
| 3136 | return 0; | ||
| 3137 | |||
| 3138 | /* | ||
| 3139 | * Buddy candidates are cache hot: | ||
| 3140 | */ | ||
| 3141 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | ||
| 3142 | (&p->se == cfs_rq_of(&p->se)->next || | ||
| 3143 | &p->se == cfs_rq_of(&p->se)->last)) | ||
| 3144 | return 1; | ||
| 3145 | |||
| 3146 | if (sysctl_sched_migration_cost == -1) | ||
| 3147 | return 1; | ||
| 3148 | if (sysctl_sched_migration_cost == 0) | ||
| 3149 | return 0; | ||
| 3150 | |||
| 3151 | delta = now - p->se.exec_start; | ||
| 3152 | |||
| 3153 | return delta < (s64)sysctl_sched_migration_cost; | ||
| 3154 | } | ||
| 3155 | |||
| 3156 | /* | ||
| 2777 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 3157 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
| 2778 | */ | 3158 | */ |
| 2779 | static | 3159 | static |
| @@ -3153,15 +3533,6 @@ struct sg_lb_stats { | |||
| 3153 | }; | 3533 | }; |
| 3154 | 3534 | ||
| 3155 | /** | 3535 | /** |
| 3156 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
| 3157 | * @group: The group whose first cpu is to be returned. | ||
| 3158 | */ | ||
| 3159 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
| 3160 | { | ||
| 3161 | return cpumask_first(sched_group_cpus(group)); | ||
| 3162 | } | ||
| 3163 | |||
| 3164 | /** | ||
| 3165 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 3536 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
| 3166 | * @sd: The sched_domain whose load_idx is to be obtained. | 3537 | * @sd: The sched_domain whose load_idx is to be obtained. |
| 3167 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. | 3538 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. |
| @@ -3410,7 +3781,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
| 3410 | sdg->sgp->power = power; | 3781 | sdg->sgp->power = power; |
| 3411 | } | 3782 | } |
| 3412 | 3783 | ||
| 3413 | static void update_group_power(struct sched_domain *sd, int cpu) | 3784 | void update_group_power(struct sched_domain *sd, int cpu) |
| 3414 | { | 3785 | { |
| 3415 | struct sched_domain *child = sd->child; | 3786 | struct sched_domain *child = sd->child; |
| 3416 | struct sched_group *group, *sdg = sd->groups; | 3787 | struct sched_group *group, *sdg = sd->groups; |
| @@ -3676,11 +4047,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 3676 | } while (sg != sd->groups); | 4047 | } while (sg != sd->groups); |
| 3677 | } | 4048 | } |
| 3678 | 4049 | ||
| 3679 | int __weak arch_sd_sibling_asym_packing(void) | ||
| 3680 | { | ||
| 3681 | return 0*SD_ASYM_PACKING; | ||
| 3682 | } | ||
| 3683 | |||
| 3684 | /** | 4050 | /** |
| 3685 | * check_asym_packing - Check to see if the group is packed into the | 4051 | * check_asym_packing - Check to see if the group is packed into the |
| 3686 | * sched doman. | 4052 | * sched doman. |
| @@ -4044,7 +4410,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
| 4044 | #define MAX_PINNED_INTERVAL 512 | 4410 | #define MAX_PINNED_INTERVAL 512 |
| 4045 | 4411 | ||
| 4046 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4412 | /* Working cpumask for load_balance and load_balance_newidle. */ |
| 4047 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4413 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
| 4048 | 4414 | ||
| 4049 | static int need_active_balance(struct sched_domain *sd, int idle, | 4415 | static int need_active_balance(struct sched_domain *sd, int idle, |
| 4050 | int busiest_cpu, int this_cpu) | 4416 | int busiest_cpu, int this_cpu) |
| @@ -4247,7 +4613,7 @@ out: | |||
| 4247 | * idle_balance is called by schedule() if this_cpu is about to become | 4613 | * idle_balance is called by schedule() if this_cpu is about to become |
| 4248 | * idle. Attempts to pull tasks from other CPUs. | 4614 | * idle. Attempts to pull tasks from other CPUs. |
| 4249 | */ | 4615 | */ |
| 4250 | static void idle_balance(int this_cpu, struct rq *this_rq) | 4616 | void idle_balance(int this_cpu, struct rq *this_rq) |
| 4251 | { | 4617 | { |
| 4252 | struct sched_domain *sd; | 4618 | struct sched_domain *sd; |
| 4253 | int pulled_task = 0; | 4619 | int pulled_task = 0; |
| @@ -4362,28 +4728,16 @@ out_unlock: | |||
| 4362 | #ifdef CONFIG_NO_HZ | 4728 | #ifdef CONFIG_NO_HZ |
| 4363 | /* | 4729 | /* |
| 4364 | * idle load balancing details | 4730 | * idle load balancing details |
| 4365 | * - One of the idle CPUs nominates itself as idle load_balancer, while | ||
| 4366 | * entering idle. | ||
| 4367 | * - This idle load balancer CPU will also go into tickless mode when | ||
| 4368 | * it is idle, just like all other idle CPUs | ||
| 4369 | * - When one of the busy CPUs notice that there may be an idle rebalancing | 4731 | * - When one of the busy CPUs notice that there may be an idle rebalancing |
| 4370 | * needed, they will kick the idle load balancer, which then does idle | 4732 | * needed, they will kick the idle load balancer, which then does idle |
| 4371 | * load balancing for all the idle CPUs. | 4733 | * load balancing for all the idle CPUs. |
| 4372 | */ | 4734 | */ |
| 4373 | static struct { | 4735 | static struct { |
| 4374 | atomic_t load_balancer; | ||
| 4375 | atomic_t first_pick_cpu; | ||
| 4376 | atomic_t second_pick_cpu; | ||
| 4377 | cpumask_var_t idle_cpus_mask; | 4736 | cpumask_var_t idle_cpus_mask; |
| 4378 | cpumask_var_t grp_idle_mask; | 4737 | atomic_t nr_cpus; |
| 4379 | unsigned long next_balance; /* in jiffy units */ | 4738 | unsigned long next_balance; /* in jiffy units */ |
| 4380 | } nohz ____cacheline_aligned; | 4739 | } nohz ____cacheline_aligned; |
| 4381 | 4740 | ||
| 4382 | int get_nohz_load_balancer(void) | ||
| 4383 | { | ||
| 4384 | return atomic_read(&nohz.load_balancer); | ||
| 4385 | } | ||
| 4386 | |||
| 4387 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 4741 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
| 4388 | /** | 4742 | /** |
| 4389 | * lowest_flag_domain - Return lowest sched_domain containing flag. | 4743 | * lowest_flag_domain - Return lowest sched_domain containing flag. |
| @@ -4420,33 +4774,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
| 4420 | (sd && (sd->flags & flag)); sd = sd->parent) | 4774 | (sd && (sd->flags & flag)); sd = sd->parent) |
| 4421 | 4775 | ||
| 4422 | /** | 4776 | /** |
| 4423 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
| 4424 | * @ilb_group: group to be checked for semi-idleness | ||
| 4425 | * | ||
| 4426 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
| 4427 | * | ||
| 4428 | * We define a sched_group to be semi idle if it has atleast one idle-CPU | ||
| 4429 | * and atleast one non-idle CPU. This helper function checks if the given | ||
| 4430 | * sched_group is semi-idle or not. | ||
| 4431 | */ | ||
| 4432 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
| 4433 | { | ||
| 4434 | cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, | ||
| 4435 | sched_group_cpus(ilb_group)); | ||
| 4436 | |||
| 4437 | /* | ||
| 4438 | * A sched_group is semi-idle when it has atleast one busy cpu | ||
| 4439 | * and atleast one idle cpu. | ||
| 4440 | */ | ||
| 4441 | if (cpumask_empty(nohz.grp_idle_mask)) | ||
| 4442 | return 0; | ||
| 4443 | |||
| 4444 | if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) | ||
| 4445 | return 0; | ||
| 4446 | |||
| 4447 | return 1; | ||
| 4448 | } | ||
| 4449 | /** | ||
| 4450 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | 4777 | * find_new_ilb - Finds the optimum idle load balancer for nomination. |
| 4451 | * @cpu: The cpu which is nominating a new idle_load_balancer. | 4778 | * @cpu: The cpu which is nominating a new idle_load_balancer. |
| 4452 | * | 4779 | * |
| @@ -4460,9 +4787,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group) | |||
| 4460 | */ | 4787 | */ |
| 4461 | static int find_new_ilb(int cpu) | 4788 | static int find_new_ilb(int cpu) |
| 4462 | { | 4789 | { |
| 4790 | int ilb = cpumask_first(nohz.idle_cpus_mask); | ||
| 4791 | struct sched_group *ilbg; | ||
| 4463 | struct sched_domain *sd; | 4792 | struct sched_domain *sd; |
| 4464 | struct sched_group *ilb_group; | ||
| 4465 | int ilb = nr_cpu_ids; | ||
| 4466 | 4793 | ||
| 4467 | /* | 4794 | /* |
| 4468 | * Have idle load balancer selection from semi-idle packages only | 4795 | * Have idle load balancer selection from semi-idle packages only |
| @@ -4480,23 +4807,28 @@ static int find_new_ilb(int cpu) | |||
| 4480 | 4807 | ||
| 4481 | rcu_read_lock(); | 4808 | rcu_read_lock(); |
| 4482 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 4809 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
| 4483 | ilb_group = sd->groups; | 4810 | ilbg = sd->groups; |
| 4484 | 4811 | ||
| 4485 | do { | 4812 | do { |
| 4486 | if (is_semi_idle_group(ilb_group)) { | 4813 | if (ilbg->group_weight != |
| 4487 | ilb = cpumask_first(nohz.grp_idle_mask); | 4814 | atomic_read(&ilbg->sgp->nr_busy_cpus)) { |
| 4815 | ilb = cpumask_first_and(nohz.idle_cpus_mask, | ||
| 4816 | sched_group_cpus(ilbg)); | ||
| 4488 | goto unlock; | 4817 | goto unlock; |
| 4489 | } | 4818 | } |
| 4490 | 4819 | ||
| 4491 | ilb_group = ilb_group->next; | 4820 | ilbg = ilbg->next; |
| 4492 | 4821 | ||
| 4493 | } while (ilb_group != sd->groups); | 4822 | } while (ilbg != sd->groups); |
| 4494 | } | 4823 | } |
| 4495 | unlock: | 4824 | unlock: |
| 4496 | rcu_read_unlock(); | 4825 | rcu_read_unlock(); |
| 4497 | 4826 | ||
| 4498 | out_done: | 4827 | out_done: |
| 4499 | return ilb; | 4828 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
| 4829 | return ilb; | ||
| 4830 | |||
| 4831 | return nr_cpu_ids; | ||
| 4500 | } | 4832 | } |
| 4501 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 4833 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
| 4502 | static inline int find_new_ilb(int call_cpu) | 4834 | static inline int find_new_ilb(int call_cpu) |
| @@ -4516,99 +4848,68 @@ static void nohz_balancer_kick(int cpu) | |||
| 4516 | 4848 | ||
| 4517 | nohz.next_balance++; | 4849 | nohz.next_balance++; |
| 4518 | 4850 | ||
| 4519 | ilb_cpu = get_nohz_load_balancer(); | 4851 | ilb_cpu = find_new_ilb(cpu); |
| 4520 | |||
| 4521 | if (ilb_cpu >= nr_cpu_ids) { | ||
| 4522 | ilb_cpu = cpumask_first(nohz.idle_cpus_mask); | ||
| 4523 | if (ilb_cpu >= nr_cpu_ids) | ||
| 4524 | return; | ||
| 4525 | } | ||
| 4526 | 4852 | ||
| 4527 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | 4853 | if (ilb_cpu >= nr_cpu_ids) |
| 4528 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | 4854 | return; |
| 4529 | 4855 | ||
| 4530 | smp_mb(); | 4856 | if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) |
| 4531 | /* | 4857 | return; |
| 4532 | * Use smp_send_reschedule() instead of resched_cpu(). | 4858 | /* |
| 4533 | * This way we generate a sched IPI on the target cpu which | 4859 | * Use smp_send_reschedule() instead of resched_cpu(). |
| 4534 | * is idle. And the softirq performing nohz idle load balance | 4860 | * This way we generate a sched IPI on the target cpu which |
| 4535 | * will be run before returning from the IPI. | 4861 | * is idle. And the softirq performing nohz idle load balance |
| 4536 | */ | 4862 | * will be run before returning from the IPI. |
| 4537 | smp_send_reschedule(ilb_cpu); | 4863 | */ |
| 4538 | } | 4864 | smp_send_reschedule(ilb_cpu); |
| 4539 | return; | 4865 | return; |
| 4540 | } | 4866 | } |
| 4541 | 4867 | ||
| 4542 | /* | 4868 | static inline void set_cpu_sd_state_busy(void) |
| 4543 | * This routine will try to nominate the ilb (idle load balancing) | ||
| 4544 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | ||
| 4545 | * load balancing on behalf of all those cpus. | ||
| 4546 | * | ||
| 4547 | * When the ilb owner becomes busy, we will not have new ilb owner until some | ||
| 4548 | * idle CPU wakes up and goes back to idle or some busy CPU tries to kick | ||
| 4549 | * idle load balancing by kicking one of the idle CPUs. | ||
| 4550 | * | ||
| 4551 | * Ticks are stopped for the ilb owner as well, with busy CPU kicking this | ||
| 4552 | * ilb owner CPU in future (when there is a need for idle load balancing on | ||
| 4553 | * behalf of all idle CPUs). | ||
| 4554 | */ | ||
| 4555 | void select_nohz_load_balancer(int stop_tick) | ||
| 4556 | { | 4869 | { |
| 4870 | struct sched_domain *sd; | ||
| 4557 | int cpu = smp_processor_id(); | 4871 | int cpu = smp_processor_id(); |
| 4558 | 4872 | ||
| 4559 | if (stop_tick) { | 4873 | if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) |
| 4560 | if (!cpu_active(cpu)) { | 4874 | return; |
| 4561 | if (atomic_read(&nohz.load_balancer) != cpu) | 4875 | clear_bit(NOHZ_IDLE, nohz_flags(cpu)); |
| 4562 | return; | ||
| 4563 | |||
| 4564 | /* | ||
| 4565 | * If we are going offline and still the leader, | ||
| 4566 | * give up! | ||
| 4567 | */ | ||
| 4568 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, | ||
| 4569 | nr_cpu_ids) != cpu) | ||
| 4570 | BUG(); | ||
| 4571 | 4876 | ||
| 4572 | return; | 4877 | rcu_read_lock(); |
| 4573 | } | 4878 | for_each_domain(cpu, sd) |
| 4879 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | ||
| 4880 | rcu_read_unlock(); | ||
| 4881 | } | ||
| 4574 | 4882 | ||
| 4575 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | 4883 | void set_cpu_sd_state_idle(void) |
| 4884 | { | ||
| 4885 | struct sched_domain *sd; | ||
| 4886 | int cpu = smp_processor_id(); | ||
| 4576 | 4887 | ||
| 4577 | if (atomic_read(&nohz.first_pick_cpu) == cpu) | 4888 | if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) |
| 4578 | atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); | 4889 | return; |
| 4579 | if (atomic_read(&nohz.second_pick_cpu) == cpu) | 4890 | set_bit(NOHZ_IDLE, nohz_flags(cpu)); |
| 4580 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | ||
| 4581 | 4891 | ||
| 4582 | if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { | 4892 | rcu_read_lock(); |
| 4583 | int new_ilb; | 4893 | for_each_domain(cpu, sd) |
| 4894 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | ||
| 4895 | rcu_read_unlock(); | ||
| 4896 | } | ||
| 4584 | 4897 | ||
| 4585 | /* make me the ilb owner */ | 4898 | /* |
| 4586 | if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, | 4899 | * This routine will record that this cpu is going idle with tick stopped. |
| 4587 | cpu) != nr_cpu_ids) | 4900 | * This info will be used in performing idle load balancing in the future. |
| 4588 | return; | 4901 | */ |
| 4902 | void select_nohz_load_balancer(int stop_tick) | ||
| 4903 | { | ||
| 4904 | int cpu = smp_processor_id(); | ||
| 4589 | 4905 | ||
| 4590 | /* | 4906 | if (stop_tick) { |
| 4591 | * Check to see if there is a more power-efficient | 4907 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
| 4592 | * ilb. | ||
| 4593 | */ | ||
| 4594 | new_ilb = find_new_ilb(cpu); | ||
| 4595 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
| 4596 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
| 4597 | resched_cpu(new_ilb); | ||
| 4598 | return; | ||
| 4599 | } | ||
| 4600 | return; | ||
| 4601 | } | ||
| 4602 | } else { | ||
| 4603 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) | ||
| 4604 | return; | 4908 | return; |
| 4605 | 4909 | ||
| 4606 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | 4910 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
| 4607 | 4911 | atomic_inc(&nohz.nr_cpus); | |
| 4608 | if (atomic_read(&nohz.load_balancer) == cpu) | 4912 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
| 4609 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, | ||
| 4610 | nr_cpu_ids) != cpu) | ||
| 4611 | BUG(); | ||
| 4612 | } | 4913 | } |
| 4613 | return; | 4914 | return; |
| 4614 | } | 4915 | } |
| @@ -4622,7 +4923,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; | |||
| 4622 | * Scale the max load_balance interval with the number of CPUs in the system. | 4923 | * Scale the max load_balance interval with the number of CPUs in the system. |
| 4623 | * This trades load-balance latency on larger machines for less cross talk. | 4924 | * This trades load-balance latency on larger machines for less cross talk. |
| 4624 | */ | 4925 | */ |
| 4625 | static void update_max_interval(void) | 4926 | void update_max_interval(void) |
| 4626 | { | 4927 | { |
| 4627 | max_load_balance_interval = HZ*num_online_cpus()/10; | 4928 | max_load_balance_interval = HZ*num_online_cpus()/10; |
| 4628 | } | 4929 | } |
| @@ -4714,11 +5015,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
| 4714 | struct rq *rq; | 5015 | struct rq *rq; |
| 4715 | int balance_cpu; | 5016 | int balance_cpu; |
| 4716 | 5017 | ||
| 4717 | if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) | 5018 | if (idle != CPU_IDLE || |
| 4718 | return; | 5019 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) |
| 5020 | goto end; | ||
| 4719 | 5021 | ||
| 4720 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | 5022 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { |
| 4721 | if (balance_cpu == this_cpu) | 5023 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) |
| 4722 | continue; | 5024 | continue; |
| 4723 | 5025 | ||
| 4724 | /* | 5026 | /* |
| @@ -4726,10 +5028,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
| 4726 | * work being done for other cpus. Next load | 5028 | * work being done for other cpus. Next load |
| 4727 | * balancing owner will pick it up. | 5029 | * balancing owner will pick it up. |
| 4728 | */ | 5030 | */ |
| 4729 | if (need_resched()) { | 5031 | if (need_resched()) |
| 4730 | this_rq->nohz_balance_kick = 0; | ||
| 4731 | break; | 5032 | break; |
| 4732 | } | ||
| 4733 | 5033 | ||
| 4734 | raw_spin_lock_irq(&this_rq->lock); | 5034 | raw_spin_lock_irq(&this_rq->lock); |
| 4735 | update_rq_clock(this_rq); | 5035 | update_rq_clock(this_rq); |
| @@ -4743,53 +5043,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
| 4743 | this_rq->next_balance = rq->next_balance; | 5043 | this_rq->next_balance = rq->next_balance; |
| 4744 | } | 5044 | } |
| 4745 | nohz.next_balance = this_rq->next_balance; | 5045 | nohz.next_balance = this_rq->next_balance; |
| 4746 | this_rq->nohz_balance_kick = 0; | 5046 | end: |
| 5047 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | ||
| 4747 | } | 5048 | } |
| 4748 | 5049 | ||
| 4749 | /* | 5050 | /* |
| 4750 | * Current heuristic for kicking the idle load balancer | 5051 | * Current heuristic for kicking the idle load balancer in the presence |
| 4751 | * - first_pick_cpu is the one of the busy CPUs. It will kick | 5052 | * of an idle cpu is the system. |
| 4752 | * idle load balancer when it has more than one process active. This | 5053 | * - This rq has more than one task. |
| 4753 | * eliminates the need for idle load balancing altogether when we have | 5054 | * - At any scheduler domain level, this cpu's scheduler group has multiple |
| 4754 | * only one running process in the system (common case). | 5055 | * busy cpu's exceeding the group's power. |
| 4755 | * - If there are more than one busy CPU, idle load balancer may have | 5056 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
| 4756 | * to run for active_load_balance to happen (i.e., two busy CPUs are | 5057 | * domain span are idle. |
| 4757 | * SMT or core siblings and can run better if they move to different | ||
| 4758 | * physical CPUs). So, second_pick_cpu is the second of the busy CPUs | ||
| 4759 | * which will kick idle load balancer as soon as it has any load. | ||
| 4760 | */ | 5058 | */ |
| 4761 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | 5059 | static inline int nohz_kick_needed(struct rq *rq, int cpu) |
| 4762 | { | 5060 | { |
| 4763 | unsigned long now = jiffies; | 5061 | unsigned long now = jiffies; |
| 4764 | int ret; | 5062 | struct sched_domain *sd; |
| 4765 | int first_pick_cpu, second_pick_cpu; | ||
| 4766 | 5063 | ||
| 4767 | if (time_before(now, nohz.next_balance)) | 5064 | if (unlikely(idle_cpu(cpu))) |
| 4768 | return 0; | 5065 | return 0; |
| 4769 | 5066 | ||
| 4770 | if (idle_cpu(cpu)) | 5067 | /* |
| 4771 | return 0; | 5068 | * We may be recently in ticked or tickless idle mode. At the first |
| 5069 | * busy tick after returning from idle, we will update the busy stats. | ||
| 5070 | */ | ||
| 5071 | set_cpu_sd_state_busy(); | ||
| 5072 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | ||
| 5073 | clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | ||
| 5074 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | ||
| 5075 | atomic_dec(&nohz.nr_cpus); | ||
| 5076 | } | ||
| 4772 | 5077 | ||
| 4773 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | 5078 | /* |
| 4774 | second_pick_cpu = atomic_read(&nohz.second_pick_cpu); | 5079 | * None are in tickless mode and hence no need for NOHZ idle load |
| 5080 | * balancing. | ||
| 5081 | */ | ||
| 5082 | if (likely(!atomic_read(&nohz.nr_cpus))) | ||
| 5083 | return 0; | ||
| 4775 | 5084 | ||
| 4776 | if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && | 5085 | if (time_before(now, nohz.next_balance)) |
| 4777 | second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) | ||
| 4778 | return 0; | 5086 | return 0; |
| 4779 | 5087 | ||
| 4780 | ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); | 5088 | if (rq->nr_running >= 2) |
| 4781 | if (ret == nr_cpu_ids || ret == cpu) { | 5089 | goto need_kick; |
| 4782 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | 5090 | |
| 4783 | if (rq->nr_running > 1) | 5091 | rcu_read_lock(); |
| 4784 | return 1; | 5092 | for_each_domain(cpu, sd) { |
| 4785 | } else { | 5093 | struct sched_group *sg = sd->groups; |
| 4786 | ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); | 5094 | struct sched_group_power *sgp = sg->sgp; |
| 4787 | if (ret == nr_cpu_ids || ret == cpu) { | 5095 | int nr_busy = atomic_read(&sgp->nr_busy_cpus); |
| 4788 | if (rq->nr_running) | 5096 | |
| 4789 | return 1; | 5097 | if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) |
| 4790 | } | 5098 | goto need_kick_unlock; |
| 5099 | |||
| 5100 | if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight | ||
| 5101 | && (cpumask_first_and(nohz.idle_cpus_mask, | ||
| 5102 | sched_domain_span(sd)) < cpu)) | ||
| 5103 | goto need_kick_unlock; | ||
| 5104 | |||
| 5105 | if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) | ||
| 5106 | break; | ||
| 4791 | } | 5107 | } |
| 5108 | rcu_read_unlock(); | ||
| 4792 | return 0; | 5109 | return 0; |
| 5110 | |||
| 5111 | need_kick_unlock: | ||
| 5112 | rcu_read_unlock(); | ||
| 5113 | need_kick: | ||
| 5114 | return 1; | ||
| 4793 | } | 5115 | } |
| 4794 | #else | 5116 | #else |
| 4795 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | 5117 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } |
| @@ -4824,14 +5146,14 @@ static inline int on_null_domain(int cpu) | |||
| 4824 | /* | 5146 | /* |
| 4825 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 5147 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
| 4826 | */ | 5148 | */ |
| 4827 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 5149 | void trigger_load_balance(struct rq *rq, int cpu) |
| 4828 | { | 5150 | { |
| 4829 | /* Don't need to rebalance while attached to NULL domain */ | 5151 | /* Don't need to rebalance while attached to NULL domain */ |
| 4830 | if (time_after_eq(jiffies, rq->next_balance) && | 5152 | if (time_after_eq(jiffies, rq->next_balance) && |
| 4831 | likely(!on_null_domain(cpu))) | 5153 | likely(!on_null_domain(cpu))) |
| 4832 | raise_softirq(SCHED_SOFTIRQ); | 5154 | raise_softirq(SCHED_SOFTIRQ); |
| 4833 | #ifdef CONFIG_NO_HZ | 5155 | #ifdef CONFIG_NO_HZ |
| 4834 | else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 5156 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) |
| 4835 | nohz_balancer_kick(cpu); | 5157 | nohz_balancer_kick(cpu); |
| 4836 | #endif | 5158 | #endif |
| 4837 | } | 5159 | } |
| @@ -4846,15 +5168,6 @@ static void rq_offline_fair(struct rq *rq) | |||
| 4846 | update_sysctl(); | 5168 | update_sysctl(); |
| 4847 | } | 5169 | } |
| 4848 | 5170 | ||
| 4849 | #else /* CONFIG_SMP */ | ||
| 4850 | |||
| 4851 | /* | ||
| 4852 | * on UP we do not need to balance between CPUs: | ||
| 4853 | */ | ||
| 4854 | static inline void idle_balance(int cpu, struct rq *rq) | ||
| 4855 | { | ||
| 4856 | } | ||
| 4857 | |||
| 4858 | #endif /* CONFIG_SMP */ | 5171 | #endif /* CONFIG_SMP */ |
| 4859 | 5172 | ||
| 4860 | /* | 5173 | /* |
| @@ -4997,6 +5310,16 @@ static void set_curr_task_fair(struct rq *rq) | |||
| 4997 | } | 5310 | } |
| 4998 | } | 5311 | } |
| 4999 | 5312 | ||
| 5313 | void init_cfs_rq(struct cfs_rq *cfs_rq) | ||
| 5314 | { | ||
| 5315 | cfs_rq->tasks_timeline = RB_ROOT; | ||
| 5316 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
| 5317 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
| 5318 | #ifndef CONFIG_64BIT | ||
| 5319 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
| 5320 | #endif | ||
| 5321 | } | ||
| 5322 | |||
| 5000 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5323 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 5001 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 5324 | static void task_move_group_fair(struct task_struct *p, int on_rq) |
| 5002 | { | 5325 | { |
| @@ -5019,7 +5342,161 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
| 5019 | if (!on_rq) | 5342 | if (!on_rq) |
| 5020 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; | 5343 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; |
| 5021 | } | 5344 | } |
| 5345 | |||
| 5346 | void free_fair_sched_group(struct task_group *tg) | ||
| 5347 | { | ||
| 5348 | int i; | ||
| 5349 | |||
| 5350 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
| 5351 | |||
| 5352 | for_each_possible_cpu(i) { | ||
| 5353 | if (tg->cfs_rq) | ||
| 5354 | kfree(tg->cfs_rq[i]); | ||
| 5355 | if (tg->se) | ||
| 5356 | kfree(tg->se[i]); | ||
| 5357 | } | ||
| 5358 | |||
| 5359 | kfree(tg->cfs_rq); | ||
| 5360 | kfree(tg->se); | ||
| 5361 | } | ||
| 5362 | |||
| 5363 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 5364 | { | ||
| 5365 | struct cfs_rq *cfs_rq; | ||
| 5366 | struct sched_entity *se; | ||
| 5367 | int i; | ||
| 5368 | |||
| 5369 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | ||
| 5370 | if (!tg->cfs_rq) | ||
| 5371 | goto err; | ||
| 5372 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); | ||
| 5373 | if (!tg->se) | ||
| 5374 | goto err; | ||
| 5375 | |||
| 5376 | tg->shares = NICE_0_LOAD; | ||
| 5377 | |||
| 5378 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
| 5379 | |||
| 5380 | for_each_possible_cpu(i) { | ||
| 5381 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | ||
| 5382 | GFP_KERNEL, cpu_to_node(i)); | ||
| 5383 | if (!cfs_rq) | ||
| 5384 | goto err; | ||
| 5385 | |||
| 5386 | se = kzalloc_node(sizeof(struct sched_entity), | ||
| 5387 | GFP_KERNEL, cpu_to_node(i)); | ||
| 5388 | if (!se) | ||
| 5389 | goto err_free_rq; | ||
| 5390 | |||
| 5391 | init_cfs_rq(cfs_rq); | ||
| 5392 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | ||
| 5393 | } | ||
| 5394 | |||
| 5395 | return 1; | ||
| 5396 | |||
| 5397 | err_free_rq: | ||
| 5398 | kfree(cfs_rq); | ||
| 5399 | err: | ||
| 5400 | return 0; | ||
| 5401 | } | ||
| 5402 | |||
| 5403 | void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
| 5404 | { | ||
| 5405 | struct rq *rq = cpu_rq(cpu); | ||
| 5406 | unsigned long flags; | ||
| 5407 | |||
| 5408 | /* | ||
| 5409 | * Only empty task groups can be destroyed; so we can speculatively | ||
| 5410 | * check on_list without danger of it being re-added. | ||
| 5411 | */ | ||
| 5412 | if (!tg->cfs_rq[cpu]->on_list) | ||
| 5413 | return; | ||
| 5414 | |||
| 5415 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 5416 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
| 5417 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 5418 | } | ||
| 5419 | |||
| 5420 | void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
| 5421 | struct sched_entity *se, int cpu, | ||
| 5422 | struct sched_entity *parent) | ||
| 5423 | { | ||
| 5424 | struct rq *rq = cpu_rq(cpu); | ||
| 5425 | |||
| 5426 | cfs_rq->tg = tg; | ||
| 5427 | cfs_rq->rq = rq; | ||
| 5428 | #ifdef CONFIG_SMP | ||
| 5429 | /* allow initial update_cfs_load() to truncate */ | ||
| 5430 | cfs_rq->load_stamp = 1; | ||
| 5022 | #endif | 5431 | #endif |
| 5432 | init_cfs_rq_runtime(cfs_rq); | ||
| 5433 | |||
| 5434 | tg->cfs_rq[cpu] = cfs_rq; | ||
| 5435 | tg->se[cpu] = se; | ||
| 5436 | |||
| 5437 | /* se could be NULL for root_task_group */ | ||
| 5438 | if (!se) | ||
| 5439 | return; | ||
| 5440 | |||
| 5441 | if (!parent) | ||
| 5442 | se->cfs_rq = &rq->cfs; | ||
| 5443 | else | ||
| 5444 | se->cfs_rq = parent->my_q; | ||
| 5445 | |||
| 5446 | se->my_q = cfs_rq; | ||
| 5447 | update_load_set(&se->load, 0); | ||
| 5448 | se->parent = parent; | ||
| 5449 | } | ||
| 5450 | |||
| 5451 | static DEFINE_MUTEX(shares_mutex); | ||
| 5452 | |||
| 5453 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
| 5454 | { | ||
| 5455 | int i; | ||
| 5456 | unsigned long flags; | ||
| 5457 | |||
| 5458 | /* | ||
| 5459 | * We can't change the weight of the root cgroup. | ||
| 5460 | */ | ||
| 5461 | if (!tg->se[0]) | ||
| 5462 | return -EINVAL; | ||
| 5463 | |||
| 5464 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); | ||
| 5465 | |||
| 5466 | mutex_lock(&shares_mutex); | ||
| 5467 | if (tg->shares == shares) | ||
| 5468 | goto done; | ||
| 5469 | |||
| 5470 | tg->shares = shares; | ||
| 5471 | for_each_possible_cpu(i) { | ||
| 5472 | struct rq *rq = cpu_rq(i); | ||
| 5473 | struct sched_entity *se; | ||
| 5474 | |||
| 5475 | se = tg->se[i]; | ||
| 5476 | /* Propagate contribution to hierarchy */ | ||
| 5477 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
| 5478 | for_each_sched_entity(se) | ||
| 5479 | update_cfs_shares(group_cfs_rq(se)); | ||
| 5480 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
| 5481 | } | ||
| 5482 | |||
| 5483 | done: | ||
| 5484 | mutex_unlock(&shares_mutex); | ||
| 5485 | return 0; | ||
| 5486 | } | ||
| 5487 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 5488 | |||
| 5489 | void free_fair_sched_group(struct task_group *tg) { } | ||
| 5490 | |||
| 5491 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 5492 | { | ||
| 5493 | return 1; | ||
| 5494 | } | ||
| 5495 | |||
| 5496 | void unregister_fair_sched_group(struct task_group *tg, int cpu) { } | ||
| 5497 | |||
| 5498 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 5499 | |||
| 5023 | 5500 | ||
| 5024 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) | 5501 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) |
| 5025 | { | 5502 | { |
| @@ -5039,7 +5516,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task | |||
| 5039 | /* | 5516 | /* |
| 5040 | * All the scheduling class methods: | 5517 | * All the scheduling class methods: |
| 5041 | */ | 5518 | */ |
| 5042 | static const struct sched_class fair_sched_class = { | 5519 | const struct sched_class fair_sched_class = { |
| 5043 | .next = &idle_sched_class, | 5520 | .next = &idle_sched_class, |
| 5044 | .enqueue_task = enqueue_task_fair, | 5521 | .enqueue_task = enqueue_task_fair, |
| 5045 | .dequeue_task = dequeue_task_fair, | 5522 | .dequeue_task = dequeue_task_fair, |
| @@ -5076,7 +5553,7 @@ static const struct sched_class fair_sched_class = { | |||
| 5076 | }; | 5553 | }; |
| 5077 | 5554 | ||
| 5078 | #ifdef CONFIG_SCHED_DEBUG | 5555 | #ifdef CONFIG_SCHED_DEBUG |
| 5079 | static void print_cfs_stats(struct seq_file *m, int cpu) | 5556 | void print_cfs_stats(struct seq_file *m, int cpu) |
| 5080 | { | 5557 | { |
| 5081 | struct cfs_rq *cfs_rq; | 5558 | struct cfs_rq *cfs_rq; |
| 5082 | 5559 | ||
| @@ -5086,3 +5563,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
| 5086 | rcu_read_unlock(); | 5563 | rcu_read_unlock(); |
| 5087 | } | 5564 | } |
| 5088 | #endif | 5565 | #endif |
| 5566 | |||
| 5567 | __init void init_sched_fair_class(void) | ||
| 5568 | { | ||
| 5569 | #ifdef CONFIG_SMP | ||
| 5570 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | ||
| 5571 | |||
| 5572 | #ifdef CONFIG_NO_HZ | ||
| 5573 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | ||
| 5574 | #endif | ||
| 5575 | #endif /* SMP */ | ||
| 5576 | |||
| 5577 | } | ||
diff --git a/kernel/sched_features.h b/kernel/sched/features.h index 84802245abd2..e61fd73913d0 100644 --- a/kernel/sched_features.h +++ b/kernel/sched/features.h | |||
| @@ -3,13 +3,13 @@ | |||
| 3 | * them to run sooner, but does not allow tons of sleepers to | 3 | * them to run sooner, but does not allow tons of sleepers to |
| 4 | * rip the spread apart. | 4 | * rip the spread apart. |
| 5 | */ | 5 | */ |
| 6 | SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) | 6 | SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) |
| 7 | 7 | ||
| 8 | /* | 8 | /* |
| 9 | * Place new tasks ahead so that they do not starve already running | 9 | * Place new tasks ahead so that they do not starve already running |
| 10 | * tasks | 10 | * tasks |
| 11 | */ | 11 | */ |
| 12 | SCHED_FEAT(START_DEBIT, 1) | 12 | SCHED_FEAT(START_DEBIT, true) |
| 13 | 13 | ||
| 14 | /* | 14 | /* |
| 15 | * Based on load and program behaviour, see if it makes sense to place | 15 | * Based on load and program behaviour, see if it makes sense to place |
| @@ -17,54 +17,54 @@ SCHED_FEAT(START_DEBIT, 1) | |||
| 17 | * improve cache locality. Typically used with SYNC wakeups as | 17 | * improve cache locality. Typically used with SYNC wakeups as |
| 18 | * generated by pipes and the like, see also SYNC_WAKEUPS. | 18 | * generated by pipes and the like, see also SYNC_WAKEUPS. |
| 19 | */ | 19 | */ |
| 20 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | 20 | SCHED_FEAT(AFFINE_WAKEUPS, true) |
| 21 | 21 | ||
| 22 | /* | 22 | /* |
| 23 | * Prefer to schedule the task we woke last (assuming it failed | 23 | * Prefer to schedule the task we woke last (assuming it failed |
| 24 | * wakeup-preemption), since its likely going to consume data we | 24 | * wakeup-preemption), since its likely going to consume data we |
| 25 | * touched, increases cache locality. | 25 | * touched, increases cache locality. |
| 26 | */ | 26 | */ |
| 27 | SCHED_FEAT(NEXT_BUDDY, 0) | 27 | SCHED_FEAT(NEXT_BUDDY, false) |
| 28 | 28 | ||
| 29 | /* | 29 | /* |
| 30 | * Prefer to schedule the task that ran last (when we did | 30 | * Prefer to schedule the task that ran last (when we did |
| 31 | * wake-preempt) as that likely will touch the same data, increases | 31 | * wake-preempt) as that likely will touch the same data, increases |
| 32 | * cache locality. | 32 | * cache locality. |
| 33 | */ | 33 | */ |
| 34 | SCHED_FEAT(LAST_BUDDY, 1) | 34 | SCHED_FEAT(LAST_BUDDY, true) |
| 35 | 35 | ||
| 36 | /* | 36 | /* |
| 37 | * Consider buddies to be cache hot, decreases the likelyness of a | 37 | * Consider buddies to be cache hot, decreases the likelyness of a |
| 38 | * cache buddy being migrated away, increases cache locality. | 38 | * cache buddy being migrated away, increases cache locality. |
| 39 | */ | 39 | */ |
| 40 | SCHED_FEAT(CACHE_HOT_BUDDY, 1) | 40 | SCHED_FEAT(CACHE_HOT_BUDDY, true) |
| 41 | 41 | ||
| 42 | /* | 42 | /* |
| 43 | * Use arch dependent cpu power functions | 43 | * Use arch dependent cpu power functions |
| 44 | */ | 44 | */ |
| 45 | SCHED_FEAT(ARCH_POWER, 0) | 45 | SCHED_FEAT(ARCH_POWER, false) |
| 46 | 46 | ||
| 47 | SCHED_FEAT(HRTICK, 0) | 47 | SCHED_FEAT(HRTICK, false) |
| 48 | SCHED_FEAT(DOUBLE_TICK, 0) | 48 | SCHED_FEAT(DOUBLE_TICK, false) |
| 49 | SCHED_FEAT(LB_BIAS, 1) | 49 | SCHED_FEAT(LB_BIAS, true) |
| 50 | 50 | ||
| 51 | /* | 51 | /* |
| 52 | * Spin-wait on mutex acquisition when the mutex owner is running on | 52 | * Spin-wait on mutex acquisition when the mutex owner is running on |
| 53 | * another cpu -- assumes that when the owner is running, it will soon | 53 | * another cpu -- assumes that when the owner is running, it will soon |
| 54 | * release the lock. Decreases scheduling overhead. | 54 | * release the lock. Decreases scheduling overhead. |
| 55 | */ | 55 | */ |
| 56 | SCHED_FEAT(OWNER_SPIN, 1) | 56 | SCHED_FEAT(OWNER_SPIN, true) |
| 57 | 57 | ||
| 58 | /* | 58 | /* |
| 59 | * Decrement CPU power based on time not spent running tasks | 59 | * Decrement CPU power based on time not spent running tasks |
| 60 | */ | 60 | */ |
| 61 | SCHED_FEAT(NONTASK_POWER, 1) | 61 | SCHED_FEAT(NONTASK_POWER, true) |
| 62 | 62 | ||
| 63 | /* | 63 | /* |
| 64 | * Queue remote wakeups on the target CPU and process them | 64 | * Queue remote wakeups on the target CPU and process them |
| 65 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | 65 | * using the scheduler IPI. Reduces rq->lock contention/bounces. |
| 66 | */ | 66 | */ |
| 67 | SCHED_FEAT(TTWU_QUEUE, 1) | 67 | SCHED_FEAT(TTWU_QUEUE, true) |
| 68 | 68 | ||
| 69 | SCHED_FEAT(FORCE_SD_OVERLAP, 0) | 69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
| 70 | SCHED_FEAT(RT_RUNTIME_SHARE, 1) | 70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c index 0a51882534ea..91b4c957f289 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched/idle_task.c | |||
| @@ -1,3 +1,5 @@ | |||
| 1 | #include "sched.h" | ||
| 2 | |||
| 1 | /* | 3 | /* |
| 2 | * idle-task scheduling class. | 4 | * idle-task scheduling class. |
| 3 | * | 5 | * |
| @@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task | |||
| 71 | /* | 73 | /* |
| 72 | * Simple, special scheduling class for the per-CPU idle tasks: | 74 | * Simple, special scheduling class for the per-CPU idle tasks: |
| 73 | */ | 75 | */ |
| 74 | static const struct sched_class idle_sched_class = { | 76 | const struct sched_class idle_sched_class = { |
| 75 | /* .next is NULL */ | 77 | /* .next is NULL */ |
| 76 | /* no enqueue/yield_task for idle tasks */ | 78 | /* no enqueue/yield_task for idle tasks */ |
| 77 | 79 | ||
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c index 583a1368afe6..3640ebbb466b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched/rt.c | |||
| @@ -3,7 +3,92 @@ | |||
| 3 | * policies) | 3 | * policies) |
| 4 | */ | 4 | */ |
| 5 | 5 | ||
| 6 | #include "sched.h" | ||
| 7 | |||
| 8 | #include <linux/slab.h> | ||
| 9 | |||
| 10 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
| 11 | |||
| 12 | struct rt_bandwidth def_rt_bandwidth; | ||
| 13 | |||
| 14 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
| 15 | { | ||
| 16 | struct rt_bandwidth *rt_b = | ||
| 17 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
| 18 | ktime_t now; | ||
| 19 | int overrun; | ||
| 20 | int idle = 0; | ||
| 21 | |||
| 22 | for (;;) { | ||
| 23 | now = hrtimer_cb_get_time(timer); | ||
| 24 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
| 25 | |||
| 26 | if (!overrun) | ||
| 27 | break; | ||
| 28 | |||
| 29 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
| 30 | } | ||
| 31 | |||
| 32 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
| 33 | } | ||
| 34 | |||
| 35 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
| 36 | { | ||
| 37 | rt_b->rt_period = ns_to_ktime(period); | ||
| 38 | rt_b->rt_runtime = runtime; | ||
| 39 | |||
| 40 | raw_spin_lock_init(&rt_b->rt_runtime_lock); | ||
| 41 | |||
| 42 | hrtimer_init(&rt_b->rt_period_timer, | ||
| 43 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 44 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
| 45 | } | ||
| 46 | |||
| 47 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
| 48 | { | ||
| 49 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | ||
| 50 | return; | ||
| 51 | |||
| 52 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
| 53 | return; | ||
| 54 | |||
| 55 | raw_spin_lock(&rt_b->rt_runtime_lock); | ||
| 56 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); | ||
| 57 | raw_spin_unlock(&rt_b->rt_runtime_lock); | ||
| 58 | } | ||
| 59 | |||
| 60 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
| 61 | { | ||
| 62 | struct rt_prio_array *array; | ||
| 63 | int i; | ||
| 64 | |||
| 65 | array = &rt_rq->active; | ||
| 66 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
| 67 | INIT_LIST_HEAD(array->queue + i); | ||
| 68 | __clear_bit(i, array->bitmap); | ||
| 69 | } | ||
| 70 | /* delimiter for bitsearch: */ | ||
| 71 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
| 72 | |||
| 73 | #if defined CONFIG_SMP | ||
| 74 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
| 75 | rt_rq->highest_prio.next = MAX_RT_PRIO; | ||
| 76 | rt_rq->rt_nr_migratory = 0; | ||
| 77 | rt_rq->overloaded = 0; | ||
| 78 | plist_head_init(&rt_rq->pushable_tasks); | ||
| 79 | #endif | ||
| 80 | |||
| 81 | rt_rq->rt_time = 0; | ||
| 82 | rt_rq->rt_throttled = 0; | ||
| 83 | rt_rq->rt_runtime = 0; | ||
| 84 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | ||
| 85 | } | ||
| 86 | |||
| 6 | #ifdef CONFIG_RT_GROUP_SCHED | 87 | #ifdef CONFIG_RT_GROUP_SCHED |
| 88 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
| 89 | { | ||
| 90 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
| 91 | } | ||
| 7 | 92 | ||
| 8 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) | 93 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) |
| 9 | 94 | ||
| @@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
| 25 | return rt_se->rt_rq; | 110 | return rt_se->rt_rq; |
| 26 | } | 111 | } |
| 27 | 112 | ||
| 113 | void free_rt_sched_group(struct task_group *tg) | ||
| 114 | { | ||
| 115 | int i; | ||
| 116 | |||
| 117 | if (tg->rt_se) | ||
| 118 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
| 119 | |||
| 120 | for_each_possible_cpu(i) { | ||
| 121 | if (tg->rt_rq) | ||
| 122 | kfree(tg->rt_rq[i]); | ||
| 123 | if (tg->rt_se) | ||
| 124 | kfree(tg->rt_se[i]); | ||
| 125 | } | ||
| 126 | |||
| 127 | kfree(tg->rt_rq); | ||
| 128 | kfree(tg->rt_se); | ||
| 129 | } | ||
| 130 | |||
| 131 | void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
| 132 | struct sched_rt_entity *rt_se, int cpu, | ||
| 133 | struct sched_rt_entity *parent) | ||
| 134 | { | ||
| 135 | struct rq *rq = cpu_rq(cpu); | ||
| 136 | |||
| 137 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
| 138 | rt_rq->rt_nr_boosted = 0; | ||
| 139 | rt_rq->rq = rq; | ||
| 140 | rt_rq->tg = tg; | ||
| 141 | |||
| 142 | tg->rt_rq[cpu] = rt_rq; | ||
| 143 | tg->rt_se[cpu] = rt_se; | ||
| 144 | |||
| 145 | if (!rt_se) | ||
| 146 | return; | ||
| 147 | |||
| 148 | if (!parent) | ||
| 149 | rt_se->rt_rq = &rq->rt; | ||
| 150 | else | ||
| 151 | rt_se->rt_rq = parent->my_q; | ||
| 152 | |||
| 153 | rt_se->my_q = rt_rq; | ||
| 154 | rt_se->parent = parent; | ||
| 155 | INIT_LIST_HEAD(&rt_se->run_list); | ||
| 156 | } | ||
| 157 | |||
| 158 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 159 | { | ||
| 160 | struct rt_rq *rt_rq; | ||
| 161 | struct sched_rt_entity *rt_se; | ||
| 162 | int i; | ||
| 163 | |||
| 164 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | ||
| 165 | if (!tg->rt_rq) | ||
| 166 | goto err; | ||
| 167 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); | ||
| 168 | if (!tg->rt_se) | ||
| 169 | goto err; | ||
| 170 | |||
| 171 | init_rt_bandwidth(&tg->rt_bandwidth, | ||
| 172 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
| 173 | |||
| 174 | for_each_possible_cpu(i) { | ||
| 175 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | ||
| 176 | GFP_KERNEL, cpu_to_node(i)); | ||
| 177 | if (!rt_rq) | ||
| 178 | goto err; | ||
| 179 | |||
| 180 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | ||
| 181 | GFP_KERNEL, cpu_to_node(i)); | ||
| 182 | if (!rt_se) | ||
| 183 | goto err_free_rq; | ||
| 184 | |||
| 185 | init_rt_rq(rt_rq, cpu_rq(i)); | ||
| 186 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 187 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | ||
| 188 | } | ||
| 189 | |||
| 190 | return 1; | ||
| 191 | |||
| 192 | err_free_rq: | ||
| 193 | kfree(rt_rq); | ||
| 194 | err: | ||
| 195 | return 0; | ||
| 196 | } | ||
| 197 | |||
| 28 | #else /* CONFIG_RT_GROUP_SCHED */ | 198 | #else /* CONFIG_RT_GROUP_SCHED */ |
| 29 | 199 | ||
| 30 | #define rt_entity_is_task(rt_se) (1) | 200 | #define rt_entity_is_task(rt_se) (1) |
| @@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
| 47 | return &rq->rt; | 217 | return &rq->rt; |
| 48 | } | 218 | } |
| 49 | 219 | ||
| 220 | void free_rt_sched_group(struct task_group *tg) { } | ||
| 221 | |||
| 222 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
| 223 | { | ||
| 224 | return 1; | ||
| 225 | } | ||
| 50 | #endif /* CONFIG_RT_GROUP_SCHED */ | 226 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 51 | 227 | ||
| 52 | #ifdef CONFIG_SMP | 228 | #ifdef CONFIG_SMP |
| @@ -556,6 +732,28 @@ static void enable_runtime(struct rq *rq) | |||
| 556 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 732 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 557 | } | 733 | } |
| 558 | 734 | ||
| 735 | int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
| 736 | { | ||
| 737 | int cpu = (int)(long)hcpu; | ||
| 738 | |||
| 739 | switch (action) { | ||
| 740 | case CPU_DOWN_PREPARE: | ||
| 741 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 742 | disable_runtime(cpu_rq(cpu)); | ||
| 743 | return NOTIFY_OK; | ||
| 744 | |||
| 745 | case CPU_DOWN_FAILED: | ||
| 746 | case CPU_DOWN_FAILED_FROZEN: | ||
| 747 | case CPU_ONLINE: | ||
| 748 | case CPU_ONLINE_FROZEN: | ||
| 749 | enable_runtime(cpu_rq(cpu)); | ||
| 750 | return NOTIFY_OK; | ||
| 751 | |||
| 752 | default: | ||
| 753 | return NOTIFY_DONE; | ||
| 754 | } | ||
| 755 | } | ||
| 756 | |||
| 559 | static int balance_runtime(struct rt_rq *rt_rq) | 757 | static int balance_runtime(struct rt_rq *rt_rq) |
| 560 | { | 758 | { |
| 561 | int more = 0; | 759 | int more = 0; |
| @@ -648,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
| 648 | if (rt_rq->rt_throttled) | 846 | if (rt_rq->rt_throttled) |
| 649 | return rt_rq_throttled(rt_rq); | 847 | return rt_rq_throttled(rt_rq); |
| 650 | 848 | ||
| 651 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | 849 | if (runtime >= sched_rt_period(rt_rq)) |
| 652 | return 0; | 850 | return 0; |
| 653 | 851 | ||
| 654 | balance_runtime(rt_rq); | 852 | balance_runtime(rt_rq); |
| @@ -957,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
| 957 | } | 1155 | } |
| 958 | 1156 | ||
| 959 | /* | 1157 | /* |
| 960 | * Put task to the end of the run list without the overhead of dequeue | 1158 | * Put task to the head or the end of the run list without the overhead of |
| 961 | * followed by enqueue. | 1159 | * dequeue followed by enqueue. |
| 962 | */ | 1160 | */ |
| 963 | static void | 1161 | static void |
| 964 | requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) | 1162 | requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) |
| @@ -1002,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
| 1002 | 1200 | ||
| 1003 | cpu = task_cpu(p); | 1201 | cpu = task_cpu(p); |
| 1004 | 1202 | ||
| 1203 | if (p->rt.nr_cpus_allowed == 1) | ||
| 1204 | goto out; | ||
| 1205 | |||
| 1005 | /* For anything but wake ups, just return the task_cpu */ | 1206 | /* For anything but wake ups, just return the task_cpu */ |
| 1006 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | 1207 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) |
| 1007 | goto out; | 1208 | goto out; |
| @@ -1178,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
| 1178 | /* Only try algorithms three times */ | 1379 | /* Only try algorithms three times */ |
| 1179 | #define RT_MAX_TRIES 3 | 1380 | #define RT_MAX_TRIES 3 |
| 1180 | 1381 | ||
| 1181 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); | ||
| 1182 | |||
| 1183 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 1382 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
| 1184 | { | 1383 | { |
| 1185 | if (!task_running(rq, p) && | 1384 | if (!task_running(rq, p) && |
| @@ -1653,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
| 1653 | pull_rt_task(rq); | 1852 | pull_rt_task(rq); |
| 1654 | } | 1853 | } |
| 1655 | 1854 | ||
| 1656 | static inline void init_sched_rt_class(void) | 1855 | void init_sched_rt_class(void) |
| 1657 | { | 1856 | { |
| 1658 | unsigned int i; | 1857 | unsigned int i; |
| 1659 | 1858 | ||
| 1660 | for_each_possible_cpu(i) | 1859 | for_each_possible_cpu(i) { |
| 1661 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), | 1860 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), |
| 1662 | GFP_KERNEL, cpu_to_node(i)); | 1861 | GFP_KERNEL, cpu_to_node(i)); |
| 1862 | } | ||
| 1663 | } | 1863 | } |
| 1664 | #endif /* CONFIG_SMP */ | 1864 | #endif /* CONFIG_SMP */ |
| 1665 | 1865 | ||
| @@ -1800,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | |||
| 1800 | return 0; | 2000 | return 0; |
| 1801 | } | 2001 | } |
| 1802 | 2002 | ||
| 1803 | static const struct sched_class rt_sched_class = { | 2003 | const struct sched_class rt_sched_class = { |
| 1804 | .next = &fair_sched_class, | 2004 | .next = &fair_sched_class, |
| 1805 | .enqueue_task = enqueue_task_rt, | 2005 | .enqueue_task = enqueue_task_rt, |
| 1806 | .dequeue_task = dequeue_task_rt, | 2006 | .dequeue_task = dequeue_task_rt, |
| @@ -1835,7 +2035,7 @@ static const struct sched_class rt_sched_class = { | |||
| 1835 | #ifdef CONFIG_SCHED_DEBUG | 2035 | #ifdef CONFIG_SCHED_DEBUG |
| 1836 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | 2036 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); |
| 1837 | 2037 | ||
| 1838 | static void print_rt_stats(struct seq_file *m, int cpu) | 2038 | void print_rt_stats(struct seq_file *m, int cpu) |
| 1839 | { | 2039 | { |
| 1840 | rt_rq_iter_t iter; | 2040 | rt_rq_iter_t iter; |
| 1841 | struct rt_rq *rt_rq; | 2041 | struct rt_rq *rt_rq; |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h new file mode 100644 index 000000000000..d8d3613a4055 --- /dev/null +++ b/kernel/sched/sched.h | |||
| @@ -0,0 +1,1136 @@ | |||
| 1 | |||
| 2 | #include <linux/sched.h> | ||
| 3 | #include <linux/mutex.h> | ||
| 4 | #include <linux/spinlock.h> | ||
| 5 | #include <linux/stop_machine.h> | ||
| 6 | |||
| 7 | #include "cpupri.h" | ||
| 8 | |||
| 9 | extern __read_mostly int scheduler_running; | ||
| 10 | |||
| 11 | /* | ||
| 12 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
| 13 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
| 14 | * and back. | ||
| 15 | */ | ||
| 16 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
| 17 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
| 18 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
| 19 | |||
| 20 | /* | ||
| 21 | * 'User priority' is the nice value converted to something we | ||
| 22 | * can work with better when scaling various scheduler parameters, | ||
| 23 | * it's a [ 0 ... 39 ] range. | ||
| 24 | */ | ||
| 25 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
| 26 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
| 27 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
| 28 | |||
| 29 | /* | ||
| 30 | * Helpers for converting nanosecond timing to jiffy resolution | ||
| 31 | */ | ||
| 32 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | ||
| 33 | |||
| 34 | #define NICE_0_LOAD SCHED_LOAD_SCALE | ||
| 35 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | ||
| 36 | |||
| 37 | /* | ||
| 38 | * These are the 'tuning knobs' of the scheduler: | ||
| 39 | * | ||
| 40 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | ||
| 41 | * Timeslices get refilled after they expire. | ||
| 42 | */ | ||
| 43 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
| 44 | |||
| 45 | /* | ||
| 46 | * single value that denotes runtime == period, ie unlimited time. | ||
| 47 | */ | ||
| 48 | #define RUNTIME_INF ((u64)~0ULL) | ||
| 49 | |||
| 50 | static inline int rt_policy(int policy) | ||
| 51 | { | ||
| 52 | if (policy == SCHED_FIFO || policy == SCHED_RR) | ||
| 53 | return 1; | ||
| 54 | return 0; | ||
| 55 | } | ||
| 56 | |||
| 57 | static inline int task_has_rt_policy(struct task_struct *p) | ||
| 58 | { | ||
| 59 | return rt_policy(p->policy); | ||
| 60 | } | ||
| 61 | |||
| 62 | /* | ||
| 63 | * This is the priority-queue data structure of the RT scheduling class: | ||
| 64 | */ | ||
| 65 | struct rt_prio_array { | ||
| 66 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | ||
| 67 | struct list_head queue[MAX_RT_PRIO]; | ||
| 68 | }; | ||
| 69 | |||
| 70 | struct rt_bandwidth { | ||
| 71 | /* nests inside the rq lock: */ | ||
| 72 | raw_spinlock_t rt_runtime_lock; | ||
| 73 | ktime_t rt_period; | ||
| 74 | u64 rt_runtime; | ||
| 75 | struct hrtimer rt_period_timer; | ||
| 76 | }; | ||
| 77 | |||
| 78 | extern struct mutex sched_domains_mutex; | ||
| 79 | |||
| 80 | #ifdef CONFIG_CGROUP_SCHED | ||
| 81 | |||
| 82 | #include <linux/cgroup.h> | ||
| 83 | |||
| 84 | struct cfs_rq; | ||
| 85 | struct rt_rq; | ||
| 86 | |||
| 87 | static LIST_HEAD(task_groups); | ||
| 88 | |||
| 89 | struct cfs_bandwidth { | ||
| 90 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 91 | raw_spinlock_t lock; | ||
| 92 | ktime_t period; | ||
| 93 | u64 quota, runtime; | ||
| 94 | s64 hierarchal_quota; | ||
| 95 | u64 runtime_expires; | ||
| 96 | |||
| 97 | int idle, timer_active; | ||
| 98 | struct hrtimer period_timer, slack_timer; | ||
| 99 | struct list_head throttled_cfs_rq; | ||
| 100 | |||
| 101 | /* statistics */ | ||
| 102 | int nr_periods, nr_throttled; | ||
| 103 | u64 throttled_time; | ||
| 104 | #endif | ||
| 105 | }; | ||
| 106 | |||
| 107 | /* task group related information */ | ||
| 108 | struct task_group { | ||
| 109 | struct cgroup_subsys_state css; | ||
| 110 | |||
| 111 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 112 | /* schedulable entities of this group on each cpu */ | ||
| 113 | struct sched_entity **se; | ||
| 114 | /* runqueue "owned" by this group on each cpu */ | ||
| 115 | struct cfs_rq **cfs_rq; | ||
| 116 | unsigned long shares; | ||
| 117 | |||
| 118 | atomic_t load_weight; | ||
| 119 | #endif | ||
| 120 | |||
| 121 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 122 | struct sched_rt_entity **rt_se; | ||
| 123 | struct rt_rq **rt_rq; | ||
| 124 | |||
| 125 | struct rt_bandwidth rt_bandwidth; | ||
| 126 | #endif | ||
| 127 | |||
| 128 | struct rcu_head rcu; | ||
| 129 | struct list_head list; | ||
| 130 | |||
| 131 | struct task_group *parent; | ||
| 132 | struct list_head siblings; | ||
| 133 | struct list_head children; | ||
| 134 | |||
| 135 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
| 136 | struct autogroup *autogroup; | ||
| 137 | #endif | ||
| 138 | |||
| 139 | struct cfs_bandwidth cfs_bandwidth; | ||
| 140 | }; | ||
| 141 | |||
| 142 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 143 | #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD | ||
| 144 | |||
| 145 | /* | ||
| 146 | * A weight of 0 or 1 can cause arithmetics problems. | ||
| 147 | * A weight of a cfs_rq is the sum of weights of which entities | ||
| 148 | * are queued on this cfs_rq, so a weight of a entity should not be | ||
| 149 | * too large, so as the shares value of a task group. | ||
| 150 | * (The default weight is 1024 - so there's no practical | ||
| 151 | * limitation from this.) | ||
| 152 | */ | ||
| 153 | #define MIN_SHARES (1UL << 1) | ||
| 154 | #define MAX_SHARES (1UL << 18) | ||
| 155 | #endif | ||
| 156 | |||
| 157 | /* Default task group. | ||
| 158 | * Every task in system belong to this group at bootup. | ||
| 159 | */ | ||
| 160 | extern struct task_group root_task_group; | ||
| 161 | |||
| 162 | typedef int (*tg_visitor)(struct task_group *, void *); | ||
| 163 | |||
| 164 | extern int walk_tg_tree_from(struct task_group *from, | ||
| 165 | tg_visitor down, tg_visitor up, void *data); | ||
| 166 | |||
| 167 | /* | ||
| 168 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
| 169 | * leaving it for the final time. | ||
| 170 | * | ||
| 171 | * Caller must hold rcu_lock or sufficient equivalent. | ||
| 172 | */ | ||
| 173 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
| 174 | { | ||
| 175 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
| 176 | } | ||
| 177 | |||
| 178 | extern int tg_nop(struct task_group *tg, void *data); | ||
| 179 | |||
| 180 | extern void free_fair_sched_group(struct task_group *tg); | ||
| 181 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); | ||
| 182 | extern void unregister_fair_sched_group(struct task_group *tg, int cpu); | ||
| 183 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
| 184 | struct sched_entity *se, int cpu, | ||
| 185 | struct sched_entity *parent); | ||
| 186 | extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | ||
| 187 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | ||
| 188 | |||
| 189 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); | ||
| 190 | extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | ||
| 191 | extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); | ||
| 192 | |||
| 193 | extern void free_rt_sched_group(struct task_group *tg); | ||
| 194 | extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); | ||
| 195 | extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
| 196 | struct sched_rt_entity *rt_se, int cpu, | ||
| 197 | struct sched_rt_entity *parent); | ||
| 198 | |||
| 199 | #else /* CONFIG_CGROUP_SCHED */ | ||
| 200 | |||
| 201 | struct cfs_bandwidth { }; | ||
| 202 | |||
| 203 | #endif /* CONFIG_CGROUP_SCHED */ | ||
| 204 | |||
| 205 | /* CFS-related fields in a runqueue */ | ||
| 206 | struct cfs_rq { | ||
| 207 | struct load_weight load; | ||
| 208 | unsigned long nr_running, h_nr_running; | ||
| 209 | |||
| 210 | u64 exec_clock; | ||
| 211 | u64 min_vruntime; | ||
| 212 | #ifndef CONFIG_64BIT | ||
| 213 | u64 min_vruntime_copy; | ||
| 214 | #endif | ||
| 215 | |||
| 216 | struct rb_root tasks_timeline; | ||
| 217 | struct rb_node *rb_leftmost; | ||
| 218 | |||
| 219 | struct list_head tasks; | ||
| 220 | struct list_head *balance_iterator; | ||
| 221 | |||
| 222 | /* | ||
| 223 | * 'curr' points to currently running entity on this cfs_rq. | ||
| 224 | * It is set to NULL otherwise (i.e when none are currently running). | ||
| 225 | */ | ||
| 226 | struct sched_entity *curr, *next, *last, *skip; | ||
| 227 | |||
| 228 | #ifdef CONFIG_SCHED_DEBUG | ||
| 229 | unsigned int nr_spread_over; | ||
| 230 | #endif | ||
| 231 | |||
| 232 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 233 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
| 234 | |||
| 235 | /* | ||
| 236 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | ||
| 237 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
| 238 | * (like users, containers etc.) | ||
| 239 | * | ||
| 240 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
| 241 | * list is used during load balance. | ||
| 242 | */ | ||
| 243 | int on_list; | ||
| 244 | struct list_head leaf_cfs_rq_list; | ||
| 245 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
| 246 | |||
| 247 | #ifdef CONFIG_SMP | ||
| 248 | /* | ||
| 249 | * the part of load.weight contributed by tasks | ||
| 250 | */ | ||
| 251 | unsigned long task_weight; | ||
| 252 | |||
| 253 | /* | ||
| 254 | * h_load = weight * f(tg) | ||
| 255 | * | ||
| 256 | * Where f(tg) is the recursive weight fraction assigned to | ||
| 257 | * this group. | ||
| 258 | */ | ||
| 259 | unsigned long h_load; | ||
| 260 | |||
| 261 | /* | ||
| 262 | * Maintaining per-cpu shares distribution for group scheduling | ||
| 263 | * | ||
| 264 | * load_stamp is the last time we updated the load average | ||
| 265 | * load_last is the last time we updated the load average and saw load | ||
| 266 | * load_unacc_exec_time is currently unaccounted execution time | ||
| 267 | */ | ||
| 268 | u64 load_avg; | ||
| 269 | u64 load_period; | ||
| 270 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
| 271 | |||
| 272 | unsigned long load_contribution; | ||
| 273 | #endif /* CONFIG_SMP */ | ||
| 274 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 275 | int runtime_enabled; | ||
| 276 | u64 runtime_expires; | ||
| 277 | s64 runtime_remaining; | ||
| 278 | |||
| 279 | u64 throttled_timestamp; | ||
| 280 | int throttled, throttle_count; | ||
| 281 | struct list_head throttled_list; | ||
| 282 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
| 283 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 284 | }; | ||
| 285 | |||
| 286 | static inline int rt_bandwidth_enabled(void) | ||
| 287 | { | ||
| 288 | return sysctl_sched_rt_runtime >= 0; | ||
| 289 | } | ||
| 290 | |||
| 291 | /* Real-Time classes' related field in a runqueue: */ | ||
| 292 | struct rt_rq { | ||
| 293 | struct rt_prio_array active; | ||
| 294 | unsigned long rt_nr_running; | ||
| 295 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | ||
| 296 | struct { | ||
| 297 | int curr; /* highest queued rt task prio */ | ||
| 298 | #ifdef CONFIG_SMP | ||
| 299 | int next; /* next highest */ | ||
| 300 | #endif | ||
| 301 | } highest_prio; | ||
| 302 | #endif | ||
| 303 | #ifdef CONFIG_SMP | ||
| 304 | unsigned long rt_nr_migratory; | ||
| 305 | unsigned long rt_nr_total; | ||
| 306 | int overloaded; | ||
| 307 | struct plist_head pushable_tasks; | ||
| 308 | #endif | ||
| 309 | int rt_throttled; | ||
| 310 | u64 rt_time; | ||
| 311 | u64 rt_runtime; | ||
| 312 | /* Nests inside the rq lock: */ | ||
| 313 | raw_spinlock_t rt_runtime_lock; | ||
| 314 | |||
| 315 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 316 | unsigned long rt_nr_boosted; | ||
| 317 | |||
| 318 | struct rq *rq; | ||
| 319 | struct list_head leaf_rt_rq_list; | ||
| 320 | struct task_group *tg; | ||
| 321 | #endif | ||
| 322 | }; | ||
| 323 | |||
| 324 | #ifdef CONFIG_SMP | ||
| 325 | |||
| 326 | /* | ||
| 327 | * We add the notion of a root-domain which will be used to define per-domain | ||
| 328 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
| 329 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
| 330 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
| 331 | * object. | ||
| 332 | * | ||
| 333 | */ | ||
| 334 | struct root_domain { | ||
| 335 | atomic_t refcount; | ||
| 336 | atomic_t rto_count; | ||
| 337 | struct rcu_head rcu; | ||
| 338 | cpumask_var_t span; | ||
| 339 | cpumask_var_t online; | ||
| 340 | |||
| 341 | /* | ||
| 342 | * The "RT overload" flag: it gets set if a CPU has more than | ||
| 343 | * one runnable RT task. | ||
| 344 | */ | ||
| 345 | cpumask_var_t rto_mask; | ||
| 346 | struct cpupri cpupri; | ||
| 347 | }; | ||
| 348 | |||
| 349 | extern struct root_domain def_root_domain; | ||
| 350 | |||
| 351 | #endif /* CONFIG_SMP */ | ||
| 352 | |||
/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	/* runqueue lock: */
	raw_spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	/* presumably the jiffies stamp of the last cpu_load[] update */
	unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
	u64 nohz_stamp;
	unsigned long nohz_flags;	/* rq_nohz_flag_bits, see nohz_flags() */
#endif
	/* non-zero: the next update_rq_clock() may be skipped */
	int skip_clock_update;

	/* capture load from *all* tasks on this cpu: */
	struct load_weight load;
	unsigned long nr_load_updates;
	u64 nr_switches;		/* context-switch counter for this rq */

	/* per-class runqueues embedded for the root group */
	struct cfs_rq cfs;
	struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this cpu: */
	struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	/* current task, the idle task, and the stop (migration) task */
	struct task_struct *curr, *idle, *stop;
	unsigned long next_balance;	/* jiffies of the next balance pass */
	struct mm_struct *prev_mm;	/* mm to drop after a context switch */

	u64 clock;			/* rq clock, nanoseconds */
	u64 clock_task;			/* NOTE(review): looks like ->clock minus
					 * irq/steal time (see prev_*_time below)
					 * — confirm in update_rq_clock_task() */

	atomic_t nr_iowait;		/* tasks from this rq blocked in io_schedule */

#ifdef CONFIG_SMP
	struct root_domain *rd;		/* root-domain this rq is attached to */
	struct sched_domain *sd;	/* base of this cpu's domain hierarchy */

	unsigned long cpu_power;	/* relative compute capacity of this cpu */

	unsigned char idle_balance;
	/* For active balancing */
	int post_schedule;
	int active_balance;
	int push_cpu;
	struct cpu_stop_work active_balance_work;
	/* cpu of this runqueue: */
	int cpu;
	int online;

	u64 rt_avg;			/* RT time sum, aged via sched_avg_update() */
	u64 age_stamp;
	u64 idle_stamp;
	u64 avg_idle;
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	u64 prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
	u64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	u64 prev_steal_time_rq;
#endif

	/* calc_load related fields */
	unsigned long calc_load_update;
	long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int hrtick_csd_pending;
	struct call_single_data hrtick_csd;
#endif
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;
	unsigned long long rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_switch;
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;
#endif

#ifdef CONFIG_SMP
	struct llist_head wake_list;	/* presumably pending remote wakeups */
#endif
};
| 478 | |||
/*
 * cpu_of - the CPU number this runqueue belongs to.
 *
 * On UP there is a single runqueue and rq->cpu does not exist,
 * so the answer is always 0.
 */
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}
| 487 | |||
| 488 | DECLARE_PER_CPU(struct rq, runqueues); | ||
| 489 | |||
| 490 | #define rcu_dereference_check_sched_domain(p) \ | ||
| 491 | rcu_dereference_check((p), \ | ||
| 492 | lockdep_is_held(&sched_domains_mutex)) | ||
| 493 | |||
| 494 | /* | ||
| 495 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
| 496 | * See detach_destroy_domains: synchronize_sched for details. | ||
| 497 | * | ||
| 498 | * The domain tree of any CPU may only be accessed from within | ||
| 499 | * preempt-disabled sections. | ||
| 500 | */ | ||
| 501 | #define for_each_domain(cpu, __sd) \ | ||
| 502 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | ||
| 503 | |||
| 504 | #define for_each_lower_domain(sd) for (; sd; sd = sd->child) | ||
| 505 | |||
| 506 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | ||
| 507 | #define this_rq() (&__get_cpu_var(runqueues)) | ||
| 508 | #define task_rq(p) cpu_rq(task_cpu(p)) | ||
| 509 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | ||
| 510 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
| 511 | |||
| 512 | #include "stats.h" | ||
| 513 | #include "auto_group.h" | ||
| 514 | |||
| 515 | #ifdef CONFIG_CGROUP_SCHED | ||
| 516 | |||
/*
 * Return the group to which this task belongs.
 *
 * We use task_subsys_state_check() and extend the RCU verification with
 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
 * task it moves into the cgroup. Therefore by holding either of those locks,
 * we pin the task to the current cgroup.
 */
static inline struct task_group *task_group(struct task_struct *p)
{
	struct task_group *tg;
	struct cgroup_subsys_state *css;

	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
			lockdep_is_held(&p->pi_lock) ||
			lockdep_is_held(&task_rq(p)->lock));
	tg = container_of(css, struct task_group, css);

	/* Give the autogroup code a chance to substitute its own group. */
	return autogroup_task_group(p, tg);
}
| 537 | |||
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
	/* Only needed when per-group runqueues exist at all. */
	struct task_group *tg = task_group(p);
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* Point the CFS entity at its group's per-cpu runqueue/parent. */
	p->se.cfs_rq = tg->cfs_rq[cpu];
	p->se.parent = tg->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	/* Same for the RT scheduling entity. */
	p->rt.rt_rq  = tg->rt_rq[cpu];
	p->rt.parent = tg->rt_se[cpu];
#endif
}
| 555 | |||
| 556 | #else /* CONFIG_CGROUP_SCHED */ | ||
| 557 | |||
/* !CONFIG_CGROUP_SCHED: tasks are not grouped, these degenerate to no-ops. */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
	return NULL;
}
| 563 | |||
| 564 | #endif /* CONFIG_CGROUP_SCHED */ | ||
| 565 | |||
/* Rebind @p's group runqueues to @cpu, then publish the new ->cpu value. */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. We must ensure that updates of
	 * per-task data have been completed by this moment.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
| 579 | |||
| 580 | /* | ||
| 581 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
| 582 | */ | ||
| 583 | #ifdef CONFIG_SCHED_DEBUG | ||
| 584 | # include <linux/jump_label.h> | ||
| 585 | # define const_debug __read_mostly | ||
| 586 | #else | ||
| 587 | # define const_debug const | ||
| 588 | #endif | ||
| 589 | |||
| 590 | extern const_debug unsigned int sysctl_sched_features; | ||
| 591 | |||
| 592 | #define SCHED_FEAT(name, enabled) \ | ||
| 593 | __SCHED_FEAT_##name , | ||
| 594 | |||
| 595 | enum { | ||
| 596 | #include "features.h" | ||
| 597 | __SCHED_FEAT_NR, | ||
| 598 | }; | ||
| 599 | |||
| 600 | #undef SCHED_FEAT | ||
| 601 | |||
| 602 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) | ||
| 603 | static __always_inline bool static_branch__true(struct jump_label_key *key) | ||
| 604 | { | ||
| 605 | return likely(static_branch(key)); /* Not out of line branch. */ | ||
| 606 | } | ||
| 607 | |||
| 608 | static __always_inline bool static_branch__false(struct jump_label_key *key) | ||
| 609 | { | ||
| 610 | return unlikely(static_branch(key)); /* Out of line branch. */ | ||
| 611 | } | ||
| 612 | |||
| 613 | #define SCHED_FEAT(name, enabled) \ | ||
| 614 | static __always_inline bool static_branch_##name(struct jump_label_key *key) \ | ||
| 615 | { \ | ||
| 616 | return static_branch__##enabled(key); \ | ||
| 617 | } | ||
| 618 | |||
| 619 | #include "features.h" | ||
| 620 | |||
| 621 | #undef SCHED_FEAT | ||
| 622 | |||
| 623 | extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; | ||
| 624 | #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) | ||
| 625 | #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ | ||
| 626 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
| 627 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | ||
| 628 | |||
| 629 | static inline u64 global_rt_period(void) | ||
| 630 | { | ||
| 631 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
| 632 | } | ||
| 633 | |||
| 634 | static inline u64 global_rt_runtime(void) | ||
| 635 | { | ||
| 636 | if (sysctl_sched_rt_runtime < 0) | ||
| 637 | return RUNTIME_INF; | ||
| 638 | |||
| 639 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
| 640 | } | ||
| 641 | |||
| 642 | |||
| 643 | |||
/* Is @p the task currently installed as @rq's running task? */
static inline int task_current(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}
| 648 | |||
/*
 * Is @p (still) running on some CPU?
 *
 * On SMP this checks ->on_cpu, which prepare_lock_switch() sets and
 * finish_lock_switch() clears, so a task being switched out remains
 * "running" until the context switch has fully completed — even after
 * rq->curr has changed. On UP it reduces to task_current().
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->on_cpu;
#else
	return task_current(rq, p);
#endif
}
| 657 | |||
| 658 | |||
/* Arch hooks around the context switch; default to no-ops. */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/* Mark @next as running before the switch; rq->lock stays held throughout. */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->on_cpu = 1;
#endif
}

/* Clear @prev's running mark and drop the rq lock taken before the switch. */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->on_cpu = 0;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	raw_spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
 * Unlocked-ctxsw variant: the rq lock is dropped *before* the actual
 * switch, so ->on_cpu must be set while the lock is still held.
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->on_cpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	raw_spin_unlock_irq(&rq->lock);
#else
	raw_spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->on_cpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	/* irqs were left disabled by prepare_lock_switch() above */
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
| 738 | |||
| 739 | |||
| 740 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
| 741 | { | ||
| 742 | lw->weight += inc; | ||
| 743 | lw->inv_weight = 0; | ||
| 744 | } | ||
| 745 | |||
| 746 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
| 747 | { | ||
| 748 | lw->weight -= dec; | ||
| 749 | lw->inv_weight = 0; | ||
| 750 | } | ||
| 751 | |||
| 752 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
| 753 | { | ||
| 754 | lw->weight = w; | ||
| 755 | lw->inv_weight = 0; | ||
| 756 | } | ||
| 757 | |||
| 758 | /* | ||
| 759 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
| 760 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
| 761 | * each task makes to its run queue's load is weighted according to its | ||
| 762 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
| 763 | * scaled version of the new time slice allocation that they receive on time | ||
| 764 | * slice expiry etc. | ||
| 765 | */ | ||
| 766 | |||
| 767 | #define WEIGHT_IDLEPRIO 3 | ||
| 768 | #define WMULT_IDLEPRIO 1431655765 | ||
| 769 | |||
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
/* Load weight per nice level, indexed by (nice + 20); nice 0 == 1024. */
static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};
| 792 | |||
/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
/* Same indexing as prio_to_weight[]; e.g. nice 0: 2^32/1024 == 4194304. */
static const u32 prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
| 810 | |||
/* Time spent by the tasks of the cpu accounting group executing in ... */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER,	/* ... user mode */
	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */

	CPUACCT_STAT_NSTATS,	/* number of entries above; keep last */
};
| 818 | |||
| 819 | |||
| 820 | #define sched_class_highest (&stop_sched_class) | ||
| 821 | #define for_each_class(class) \ | ||
| 822 | for (class = sched_class_highest; class; class = class->next) | ||
| 823 | |||
| 824 | extern const struct sched_class stop_sched_class; | ||
| 825 | extern const struct sched_class rt_sched_class; | ||
| 826 | extern const struct sched_class fair_sched_class; | ||
| 827 | extern const struct sched_class idle_sched_class; | ||
| 828 | |||
| 829 | |||
| 830 | #ifdef CONFIG_SMP | ||
| 831 | |||
| 832 | extern void trigger_load_balance(struct rq *rq, int cpu); | ||
| 833 | extern void idle_balance(int this_cpu, struct rq *this_rq); | ||
| 834 | |||
| 835 | #else /* CONFIG_SMP */ | ||
| 836 | |||
| 837 | static inline void idle_balance(int cpu, struct rq *rq) | ||
| 838 | { | ||
| 839 | } | ||
| 840 | |||
| 841 | #endif | ||
| 842 | |||
| 843 | extern void sysrq_sched_debug_show(void); | ||
| 844 | extern void sched_init_granularity(void); | ||
| 845 | extern void update_max_interval(void); | ||
| 846 | extern void update_group_power(struct sched_domain *sd, int cpu); | ||
| 847 | extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); | ||
| 848 | extern void init_sched_rt_class(void); | ||
| 849 | extern void init_sched_fair_class(void); | ||
| 850 | |||
| 851 | extern void resched_task(struct task_struct *p); | ||
| 852 | extern void resched_cpu(int cpu); | ||
| 853 | |||
| 854 | extern struct rt_bandwidth def_rt_bandwidth; | ||
| 855 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | ||
| 856 | |||
| 857 | extern void update_cpu_load(struct rq *this_rq); | ||
| 858 | |||
| 859 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 860 | #include <linux/cgroup.h> | ||
| 861 | /* track cpu usage of a group of tasks and its child groups */ | ||
| 862 | struct cpuacct { | ||
| 863 | struct cgroup_subsys_state css; | ||
| 864 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
| 865 | u64 __percpu *cpuusage; | ||
| 866 | struct kernel_cpustat __percpu *cpustat; | ||
| 867 | }; | ||
| 868 | |||
/* return cpu accounting group corresponding to this container */
static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
{
	/* struct cpuacct embeds its css, so container_of() recovers it. */
	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
			    struct cpuacct, css);
}
| 875 | |||
/* return cpu accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
	/* Same container_of() trick as cgroup_ca(), starting from the task. */
	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
			    struct cpuacct, css);
}
| 882 | |||
| 883 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
| 884 | { | ||
| 885 | if (!ca || !ca->css.cgroup->parent) | ||
| 886 | return NULL; | ||
| 887 | return cgroup_ca(ca->css.cgroup->parent); | ||
| 888 | } | ||
| 889 | |||
| 890 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
| 891 | #else | ||
| 892 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
| 893 | #endif | ||
| 894 | |||
/* Account one more runnable task on @rq. */
static inline void inc_nr_running(struct rq *rq)
{
	rq->nr_running++;
}
| 899 | |||
/* Account one fewer runnable task on @rq. */
static inline void dec_nr_running(struct rq *rq)
{
	rq->nr_running--;
}
| 904 | |||
| 905 | extern void update_rq_clock(struct rq *rq); | ||
| 906 | |||
| 907 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); | ||
| 908 | extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); | ||
| 909 | |||
| 910 | extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); | ||
| 911 | |||
| 912 | extern const_debug unsigned int sysctl_sched_time_avg; | ||
| 913 | extern const_debug unsigned int sysctl_sched_nr_migrate; | ||
| 914 | extern const_debug unsigned int sysctl_sched_migration_cost; | ||
| 915 | |||
| 916 | static inline u64 sched_avg_period(void) | ||
| 917 | { | ||
| 918 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
| 919 | } | ||
| 920 | |||
| 921 | void calc_load_account_idle(struct rq *this_rq); | ||
| 922 | |||
| 923 | #ifdef CONFIG_SCHED_HRTICK | ||
| 924 | |||
| 925 | /* | ||
| 926 | * Use hrtick when: | ||
| 927 | * - enabled by features | ||
| 928 | * - hrtimer is actually high res | ||
| 929 | */ | ||
| 930 | static inline int hrtick_enabled(struct rq *rq) | ||
| 931 | { | ||
| 932 | if (!sched_feat(HRTICK)) | ||
| 933 | return 0; | ||
| 934 | if (!cpu_active(cpu_of(rq))) | ||
| 935 | return 0; | ||
| 936 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
| 937 | } | ||
| 938 | |||
| 939 | void hrtick_start(struct rq *rq, u64 delay); | ||
| 940 | |||
| 941 | #else | ||
| 942 | |||
/* !CONFIG_SCHED_HRTICK: the high-resolution tick is never available. */
static inline int hrtick_enabled(struct rq *rq)
{
	return 0;
}
| 947 | |||
| 948 | #endif /* CONFIG_SCHED_HRTICK */ | ||
| 949 | |||
#ifdef CONFIG_SMP
extern void sched_avg_update(struct rq *rq);

/*
 * Fold @rt_delta ns of RT execution into rq->rt_avg and let
 * sched_avg_update() age the accumulated sum.
 */
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
	rq->rt_avg += rt_delta;
	sched_avg_update(rq);
}
#else
/* !SMP: RT-average tracking is not used; both become no-ops. */
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
| 961 | |||
| 962 | extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); | ||
| 963 | |||
| 964 | #ifdef CONFIG_SMP | ||
| 965 | #ifdef CONFIG_PREEMPT | ||
| 966 | |||
| 967 | static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
| 968 | |||
/*
 * fair double_lock_balance: Safely acquires both rq->locks in a fair
 * way at the expense of forcing extra atomic operations in all
 * invocations. This assures that the double_lock is acquired using the
 * same underlying policy as the spinlock_t on this architecture, which
 * reduces latency compared to the unfair variant below. However, it
 * also adds more overhead and therefore may reduce throughput.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	raw_spin_unlock(&this_rq->lock);
	double_rq_lock(this_rq, busiest);

	/* this_rq->lock was always dropped and re-taken here. */
	return 1;
}
| 987 | |||
| 988 | #else | ||
/*
 * Unfair double_lock_balance: Optimizes throughput at the expense of
 * latency by eliminating extra atomic operations when the locks are
 * already in proper order on entry. This favors lower cpu-ids and will
 * grant the double lock to lower cpus over higher ids under contention,
 * regardless of entry order into the function.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	/* 1 iff this_rq->lock was dropped and re-taken below. */
	int ret = 0;

	if (unlikely(!raw_spin_trylock(&busiest->lock))) {
		if (busiest < this_rq) {
			/* Wrong order: release and re-acquire lowest-first. */
			raw_spin_unlock(&this_rq->lock);
			raw_spin_lock(&busiest->lock);
			raw_spin_lock_nested(&this_rq->lock,
					      SINGLE_DEPTH_NESTING);
			ret = 1;
		} else
			raw_spin_lock_nested(&busiest->lock,
					      SINGLE_DEPTH_NESTING);
	}
	return ret;
}
| 1016 | |||
| 1017 | #endif /* CONFIG_PREEMPT */ | ||
| 1018 | |||
/*
 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 *
 * Returns 1 if this_rq->lock was dropped and re-acquired along the way,
 * in which case any state cached under the lock may be stale.
 */
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
	if (unlikely(!irqs_disabled())) {
		/* printk() doesn't work well under rq->lock */
		raw_spin_unlock(&this_rq->lock);
		BUG_ON(1);
	}

	return _double_lock_balance(this_rq, busiest);
}
| 1032 | |||
/*
 * Release busiest's lock while keeping this_rq locked; restore lockdep
 * subclass 0 on this_rq->lock (it may have been taken with
 * SINGLE_DEPTH_NESTING by _double_lock_balance()).
 */
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(busiest->lock)
{
	raw_spin_unlock(&busiest->lock);
	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
| 1039 | |||
/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 *
 * Deadlock is avoided by always taking the lower-addressed lock first.
 */
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	if (rq1 == rq2) {
		raw_spin_lock(&rq1->lock);
		__acquire(rq2->lock);	/* Fake it out ;) */
	} else {
		if (rq1 < rq2) {
			raw_spin_lock(&rq1->lock);
			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
		} else {
			raw_spin_lock(&rq2->lock);
			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
		}
	}
}
| 1064 | |||
/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 */
static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	raw_spin_unlock(&rq1->lock);
	if (rq1 != rq2)
		raw_spin_unlock(&rq2->lock);
	else
		/* Balance the fake __acquire() done in double_rq_lock(). */
		__release(rq2->lock);
}
| 1081 | |||
| 1082 | #else /* CONFIG_SMP */ | ||
| 1083 | |||
/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 *
 * UP: there is only one runqueue, so both arguments must be it.
 */
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	BUG_ON(rq1 != rq2);
	raw_spin_lock(&rq1->lock);
	__acquire(rq2->lock);	/* Fake it out ;) */
}
| 1099 | |||
/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 *
 * UP: both arguments must be the single runqueue.
 */
static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	BUG_ON(rq1 != rq2);
	raw_spin_unlock(&rq1->lock);
	__release(rq2->lock);
}
| 1114 | |||
| 1115 | #endif | ||
| 1116 | |||
| 1117 | extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); | ||
| 1118 | extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); | ||
| 1119 | extern void print_cfs_stats(struct seq_file *m, int cpu); | ||
| 1120 | extern void print_rt_stats(struct seq_file *m, int cpu); | ||
| 1121 | |||
| 1122 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | ||
| 1123 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | ||
| 1124 | extern void unthrottle_offline_cfs_rqs(struct rq *rq); | ||
| 1125 | |||
| 1126 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | ||
| 1127 | |||
| 1128 | #ifdef CONFIG_NO_HZ | ||
| 1129 | enum rq_nohz_flag_bits { | ||
| 1130 | NOHZ_TICK_STOPPED, | ||
| 1131 | NOHZ_BALANCE_KICK, | ||
| 1132 | NOHZ_IDLE, | ||
| 1133 | }; | ||
| 1134 | |||
| 1135 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | ||
| 1136 | #endif | ||
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c new file mode 100644 index 000000000000..2a581ba8e190 --- /dev/null +++ b/kernel/sched/stats.c | |||
| @@ -0,0 +1,111 @@ | |||
| 1 | |||
| 2 | #include <linux/slab.h> | ||
| 3 | #include <linux/fs.h> | ||
| 4 | #include <linux/seq_file.h> | ||
| 5 | #include <linux/proc_fs.h> | ||
| 6 | |||
| 7 | #include "sched.h" | ||
| 8 | |||
| 9 | /* | ||
| 10 | * bump this up when changing the output format or the meaning of an existing | ||
| 11 | * format, so that tools can adapt (or abort) | ||
| 12 | */ | ||
| 13 | #define SCHEDSTAT_VERSION 15 | ||
| 14 | |||
| 15 | static int show_schedstat(struct seq_file *seq, void *v) | ||
| 16 | { | ||
| 17 | int cpu; | ||
| 18 | int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; | ||
| 19 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
| 20 | |||
| 21 | if (mask_str == NULL) | ||
| 22 | return -ENOMEM; | ||
| 23 | |||
| 24 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
| 25 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
| 26 | for_each_online_cpu(cpu) { | ||
| 27 | struct rq *rq = cpu_rq(cpu); | ||
| 28 | #ifdef CONFIG_SMP | ||
| 29 | struct sched_domain *sd; | ||
| 30 | int dcount = 0; | ||
| 31 | #endif | ||
| 32 | |||
| 33 | /* runqueue-specific stats */ | ||
| 34 | seq_printf(seq, | ||
| 35 | "cpu%d %u %u %u %u %u %u %llu %llu %lu", | ||
| 36 | cpu, rq->yld_count, | ||
| 37 | rq->sched_switch, rq->sched_count, rq->sched_goidle, | ||
| 38 | rq->ttwu_count, rq->ttwu_local, | ||
| 39 | rq->rq_cpu_time, | ||
| 40 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); | ||
| 41 | |||
| 42 | seq_printf(seq, "\n"); | ||
| 43 | |||
| 44 | #ifdef CONFIG_SMP | ||
| 45 | /* domain-specific stats */ | ||
| 46 | rcu_read_lock(); | ||
| 47 | for_each_domain(cpu, sd) { | ||
| 48 | enum cpu_idle_type itype; | ||
| 49 | |||
| 50 | cpumask_scnprintf(mask_str, mask_len, | ||
| 51 | sched_domain_span(sd)); | ||
| 52 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | ||
| 53 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | ||
| 54 | itype++) { | ||
| 55 | seq_printf(seq, " %u %u %u %u %u %u %u %u", | ||
| 56 | sd->lb_count[itype], | ||
| 57 | sd->lb_balanced[itype], | ||
| 58 | sd->lb_failed[itype], | ||
| 59 | sd->lb_imbalance[itype], | ||
| 60 | sd->lb_gained[itype], | ||
| 61 | sd->lb_hot_gained[itype], | ||
| 62 | sd->lb_nobusyq[itype], | ||
| 63 | sd->lb_nobusyg[itype]); | ||
| 64 | } | ||
| 65 | seq_printf(seq, | ||
| 66 | " %u %u %u %u %u %u %u %u %u %u %u %u\n", | ||
| 67 | sd->alb_count, sd->alb_failed, sd->alb_pushed, | ||
| 68 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, | ||
| 69 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, | ||
| 70 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | ||
| 71 | sd->ttwu_move_balance); | ||
| 72 | } | ||
| 73 | rcu_read_unlock(); | ||
| 74 | #endif | ||
| 75 | } | ||
| 76 | kfree(mask_str); | ||
| 77 | return 0; | ||
| 78 | } | ||
| 79 | |||
| 80 | static int schedstat_open(struct inode *inode, struct file *file) | ||
| 81 | { | ||
| 82 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
| 83 | char *buf = kmalloc(size, GFP_KERNEL); | ||
| 84 | struct seq_file *m; | ||
| 85 | int res; | ||
| 86 | |||
| 87 | if (!buf) | ||
| 88 | return -ENOMEM; | ||
| 89 | res = single_open(file, show_schedstat, NULL); | ||
| 90 | if (!res) { | ||
| 91 | m = file->private_data; | ||
| 92 | m->buf = buf; | ||
| 93 | m->size = size; | ||
| 94 | } else | ||
| 95 | kfree(buf); | ||
| 96 | return res; | ||
| 97 | } | ||
| 98 | |||
| 99 | static const struct file_operations proc_schedstat_operations = { | ||
| 100 | .open = schedstat_open, | ||
| 101 | .read = seq_read, | ||
| 102 | .llseek = seq_lseek, | ||
| 103 | .release = single_release, | ||
| 104 | }; | ||
| 105 | |||
| 106 | static int __init proc_schedstat_init(void) | ||
| 107 | { | ||
| 108 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | ||
| 109 | return 0; | ||
| 110 | } | ||
| 111 | module_init(proc_schedstat_init); | ||
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h index 4b71dbef271d..2ef90a51ec5e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched/stats.h | |||
| @@ -1,108 +1,5 @@ | |||
| 1 | 1 | ||
| 2 | #ifdef CONFIG_SCHEDSTATS | 2 | #ifdef CONFIG_SCHEDSTATS |
| 3 | /* | ||
| 4 | * bump this up when changing the output format or the meaning of an existing | ||
| 5 | * format, so that tools can adapt (or abort) | ||
| 6 | */ | ||
| 7 | #define SCHEDSTAT_VERSION 15 | ||
| 8 | |||
| 9 | static int show_schedstat(struct seq_file *seq, void *v) | ||
| 10 | { | ||
| 11 | int cpu; | ||
| 12 | int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; | ||
| 13 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
| 14 | |||
| 15 | if (mask_str == NULL) | ||
| 16 | return -ENOMEM; | ||
| 17 | |||
| 18 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
| 19 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
| 20 | for_each_online_cpu(cpu) { | ||
| 21 | struct rq *rq = cpu_rq(cpu); | ||
| 22 | #ifdef CONFIG_SMP | ||
| 23 | struct sched_domain *sd; | ||
| 24 | int dcount = 0; | ||
| 25 | #endif | ||
| 26 | |||
| 27 | /* runqueue-specific stats */ | ||
| 28 | seq_printf(seq, | ||
| 29 | "cpu%d %u %u %u %u %u %u %llu %llu %lu", | ||
| 30 | cpu, rq->yld_count, | ||
| 31 | rq->sched_switch, rq->sched_count, rq->sched_goidle, | ||
| 32 | rq->ttwu_count, rq->ttwu_local, | ||
| 33 | rq->rq_cpu_time, | ||
| 34 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); | ||
| 35 | |||
| 36 | seq_printf(seq, "\n"); | ||
| 37 | |||
| 38 | #ifdef CONFIG_SMP | ||
| 39 | /* domain-specific stats */ | ||
| 40 | rcu_read_lock(); | ||
| 41 | for_each_domain(cpu, sd) { | ||
| 42 | enum cpu_idle_type itype; | ||
| 43 | |||
| 44 | cpumask_scnprintf(mask_str, mask_len, | ||
| 45 | sched_domain_span(sd)); | ||
| 46 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | ||
| 47 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | ||
| 48 | itype++) { | ||
| 49 | seq_printf(seq, " %u %u %u %u %u %u %u %u", | ||
| 50 | sd->lb_count[itype], | ||
| 51 | sd->lb_balanced[itype], | ||
| 52 | sd->lb_failed[itype], | ||
| 53 | sd->lb_imbalance[itype], | ||
| 54 | sd->lb_gained[itype], | ||
| 55 | sd->lb_hot_gained[itype], | ||
| 56 | sd->lb_nobusyq[itype], | ||
| 57 | sd->lb_nobusyg[itype]); | ||
| 58 | } | ||
| 59 | seq_printf(seq, | ||
| 60 | " %u %u %u %u %u %u %u %u %u %u %u %u\n", | ||
| 61 | sd->alb_count, sd->alb_failed, sd->alb_pushed, | ||
| 62 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, | ||
| 63 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, | ||
| 64 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | ||
| 65 | sd->ttwu_move_balance); | ||
| 66 | } | ||
| 67 | rcu_read_unlock(); | ||
| 68 | #endif | ||
| 69 | } | ||
| 70 | kfree(mask_str); | ||
| 71 | return 0; | ||
| 72 | } | ||
| 73 | |||
| 74 | static int schedstat_open(struct inode *inode, struct file *file) | ||
| 75 | { | ||
| 76 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
| 77 | char *buf = kmalloc(size, GFP_KERNEL); | ||
| 78 | struct seq_file *m; | ||
| 79 | int res; | ||
| 80 | |||
| 81 | if (!buf) | ||
| 82 | return -ENOMEM; | ||
| 83 | res = single_open(file, show_schedstat, NULL); | ||
| 84 | if (!res) { | ||
| 85 | m = file->private_data; | ||
| 86 | m->buf = buf; | ||
| 87 | m->size = size; | ||
| 88 | } else | ||
| 89 | kfree(buf); | ||
| 90 | return res; | ||
| 91 | } | ||
| 92 | |||
| 93 | static const struct file_operations proc_schedstat_operations = { | ||
| 94 | .open = schedstat_open, | ||
| 95 | .read = seq_read, | ||
| 96 | .llseek = seq_lseek, | ||
| 97 | .release = single_release, | ||
| 98 | }; | ||
| 99 | |||
| 100 | static int __init proc_schedstat_init(void) | ||
| 101 | { | ||
| 102 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | ||
| 103 | return 0; | ||
| 104 | } | ||
| 105 | module_init(proc_schedstat_init); | ||
| 106 | 3 | ||
| 107 | /* | 4 | /* |
| 108 | * Expects runqueue lock to be held for atomicity of update | 5 | * Expects runqueue lock to be held for atomicity of update |
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c index 8b44e7fa7fb3..7b386e86fd23 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched/stop_task.c | |||
| @@ -1,3 +1,5 @@ | |||
| 1 | #include "sched.h" | ||
| 2 | |||
| 1 | /* | 3 | /* |
| 2 | * stop-task scheduling class. | 4 | * stop-task scheduling class. |
| 3 | * | 5 | * |
| @@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) | |||
| 80 | /* | 82 | /* |
| 81 | * Simple, special scheduling class for the per-CPU stop tasks: | 83 | * Simple, special scheduling class for the per-CPU stop tasks: |
| 82 | */ | 84 | */ |
| 83 | static const struct sched_class stop_sched_class = { | 85 | const struct sched_class stop_sched_class = { |
| 84 | .next = &rt_sched_class, | 86 | .next = &rt_sched_class, |
| 85 | 87 | ||
| 86 | .enqueue_task = enqueue_task_stop, | 88 | .enqueue_task = enqueue_task_stop, |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 40420644d0ba..31cc06163ed5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -297,6 +297,15 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
| 297 | ts = &per_cpu(tick_cpu_sched, cpu); | 297 | ts = &per_cpu(tick_cpu_sched, cpu); |
| 298 | 298 | ||
| 299 | /* | 299 | /* |
| 300 | * Update the idle state in the scheduler domain hierarchy | ||
| 301 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | ||
| 302 | * State will be updated to busy during the first busy tick after | ||
| 303 | * exiting idle. | ||
| 304 | */ | ||
| 305 | if (inidle) | ||
| 306 | set_cpu_sd_state_idle(); | ||
| 307 | |||
| 308 | /* | ||
| 300 | * Call to tick_nohz_start_idle stops the last_update_time from being | 309 | * Call to tick_nohz_start_idle stops the last_update_time from being |
| 301 | * updated. Thus, it must not be called in the event we are called from | 310 | * updated. Thus, it must not be called in the event we are called from |
| 302 | * irq_exit() with the prior state different than idle. | 311 | * irq_exit() with the prior state different than idle. |
