Diffstat (limited to 'kernel')
56 files changed, 3134 insertions, 1272 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 1ce47553fb02..a4d1aa8da9bc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -6,9 +6,9 @@ obj-y = fork.o exec_domain.o panic.o \ | |||
| 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
| 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ |
| 8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ |
| 9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | extable.o params.o posix-timers.o \ |
| 10 | kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o sys_ni.o posix-cpu-timers.o mutex.o \ |
| 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o semaphore.o \ |
| 12 | notifier.o ksysfs.o cred.o reboot.o \ | 12 | notifier.o ksysfs.o cred.o reboot.o \ |
| 13 | async.o range.o groups.o lglock.o smpboot.o | 13 | async.o range.o groups.o lglock.o smpboot.o |
| 14 | 14 | ||
| @@ -27,6 +27,7 @@ obj-y += power/ | |||
| 27 | obj-y += printk/ | 27 | obj-y += printk/ |
| 28 | obj-y += cpu/ | 28 | obj-y += cpu/ |
| 29 | obj-y += irq/ | 29 | obj-y += irq/ |
| 30 | obj-y += rcu/ | ||
| 30 | 31 | ||
| 31 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o | 32 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o |
| 32 | obj-$(CONFIG_FREEZER) += freezer.o | 33 | obj-$(CONFIG_FREEZER) += freezer.o |
| @@ -81,12 +82,6 @@ obj-$(CONFIG_KGDB) += debug/ | |||
| 81 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | 82 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o |
| 82 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o | 83 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o |
| 83 | obj-$(CONFIG_SECCOMP) += seccomp.o | 84 | obj-$(CONFIG_SECCOMP) += seccomp.o |
| 84 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | ||
| 85 | obj-$(CONFIG_TREE_RCU) += rcutree.o | ||
| 86 | obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o | ||
| 87 | obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o | ||
| 88 | obj-$(CONFIG_TINY_RCU) += rcutiny.o | ||
| 89 | obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o | ||
| 90 | obj-$(CONFIG_RELAY) += relay.o | 85 | obj-$(CONFIG_RELAY) += relay.o |
| 91 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 86 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
| 92 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 87 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..e8ca97b5c386 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
| @@ -10,6 +10,7 @@ | |||
| 10 | #include <linux/mmzone.h> | 10 | #include <linux/mmzone.h> |
| 11 | #include <linux/kbuild.h> | 11 | #include <linux/kbuild.h> |
| 12 | #include <linux/page_cgroup.h> | 12 | #include <linux/page_cgroup.h> |
| 13 | #include <linux/log2.h> | ||
| 13 | 14 | ||
| 14 | void foo(void) | 15 | void foo(void) |
| 15 | { | 16 | { |
| @@ -17,5 +18,8 @@ void foo(void) | |||
| 17 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); | 18 | DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); |
| 18 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); | 19 | DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); |
| 19 | DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); | 20 | DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); |
| 21 | #ifdef CONFIG_SMP | ||
| 22 | DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); | ||
| 23 | #endif | ||
| 20 | /* End of constants */ | 24 | /* End of constants */ |
| 21 | } | 25 | } |
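The new NR_CPUS_BITS constant is just ilog2(CONFIG_NR_CPUS), the index of the most significant set bit, evaluated at build time and emitted into the generated bounds header, presumably so code that cannot pull in log2.h can still size fields that hold a CPU number. A minimal userspace model of what that expression computes; the helper name and the __builtin_clzl() implementation are ours, not the kernel's:

#include <stdio.h>

/* Rough model of the kernel's ilog2(): floor(log2(n)), i.e. the index
 * of the most significant set bit.  The real macro also folds to a
 * compile-time constant for constant input, which is what lets
 * bounds.c emit NR_CPUS_BITS into a generated header. */
static unsigned int ilog2_model(unsigned long n)
{
	return 8 * sizeof(n) - 1 - __builtin_clzl(n);
}

int main(void)
{
	unsigned long nr_cpus[] = { 1, 2, 4, 64, 256, 4096 };

	for (unsigned int i = 0; i < sizeof(nr_cpus) / sizeof(nr_cpus[0]); i++)
		printf("NR_CPUS=%4lu -> NR_CPUS_BITS=%u\n",
		       nr_cpus[i], ilog2_model(nr_cpus[i]));
	return 0;
}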
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 859c8dfd78a1..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
| @@ -120,7 +120,7 @@ void context_tracking_user_enter(void) | |||
| 120 | * instead of preempt_schedule() to exit user context if needed before | 120 | * instead of preempt_schedule() to exit user context if needed before |
| 121 | * calling the scheduler. | 121 | * calling the scheduler. |
| 122 | */ | 122 | */ |
| 123 | void __sched notrace preempt_schedule_context(void) | 123 | asmlinkage void __sched notrace preempt_schedule_context(void) |
| 124 | { | 124 | { |
| 125 | enum ctx_state prev_ctx; | 125 | enum ctx_state prev_ctx; |
| 126 | 126 | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d7f07a2da5a6..63aa50d7ce1e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
| @@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 308 | } | 308 | } |
| 309 | smpboot_park_threads(cpu); | 309 | smpboot_park_threads(cpu); |
| 310 | 310 | ||
| 311 | /* | ||
| 312 | * By now we've cleared cpu_active_mask, wait for all preempt-disabled | ||
| 313 | * and RCU users of this state to go away such that all new such users | ||
| 314 | * will observe it. | ||
| 315 | * | ||
| 316 | * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might | ||
| 317 | * not imply sync_sched(), so explicitly call both. | ||
| 318 | */ | ||
| 319 | #ifdef CONFIG_PREEMPT | ||
| 320 | synchronize_sched(); | ||
| 321 | #endif | ||
| 322 | synchronize_rcu(); | ||
| 323 | |||
| 324 | /* | ||
| 325 | * So now all preempt/rcu users must observe !cpu_active(). | ||
| 326 | */ | ||
| 327 | |||
| 311 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); | 328 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); |
| 312 | if (err) { | 329 | if (err) { |
| 313 | /* CPU didn't die: tell everyone. Can't complain. */ | 330 | /* CPU didn't die: tell everyone. Can't complain. */ |
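The comment block added here is the whole story: once cpu_active has been cleared, both kinds of read-side sections (preempt-disabled and rcu_read_lock) have to drain, and under CONFIG_PREEMPT synchronize_rcu() alone does not cover the preempt-disabled ones. A hedged, kernel-style sketch of that reader/writer pairing follows; it is not the hotplug code itself and the two function names are invented:

#include <linux/cpumask.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

/* Illustrative only: readers may rely on either preempt_disable() or
 * rcu_read_lock() to keep cpu_active() stable, so the writer must wait
 * out both read-side flavours after flipping the bit. */
static bool reader_sees_active(int cpu)
{
	bool active;

	preempt_disable();		/* other readers use rcu_read_lock() */
	active = cpu_active(cpu);
	preempt_enable();

	return active;
}

static void writer_clear_active(int cpu)
{
	set_cpu_active(cpu, false);

#ifdef CONFIG_PREEMPT
	synchronize_sched();		/* waits out preempt-disabled readers */
#endif
	synchronize_rcu();		/* waits out rcu_read_lock() readers */
}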
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
| @@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void) | |||
| 44 | rcu_idle_enter(); | 44 | rcu_idle_enter(); |
| 45 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 45 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
| 46 | local_irq_enable(); | 46 | local_irq_enable(); |
| 47 | while (!need_resched()) | 47 | while (!tif_need_resched()) |
| 48 | cpu_relax(); | 48 | cpu_relax(); |
| 49 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 49 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
| 50 | rcu_idle_exit(); | 50 | rcu_idle_exit(); |
| @@ -92,8 +92,7 @@ static void cpu_idle_loop(void) | |||
| 92 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) { | 92 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) { |
| 93 | cpu_idle_poll(); | 93 | cpu_idle_poll(); |
| 94 | } else { | 94 | } else { |
| 95 | current_clr_polling(); | 95 | if (!current_clr_polling_and_test()) { |
| 96 | if (!need_resched()) { | ||
| 97 | stop_critical_timings(); | 96 | stop_critical_timings(); |
| 98 | rcu_idle_enter(); | 97 | rcu_idle_enter(); |
| 99 | arch_cpu_idle(); | 98 | arch_cpu_idle(); |
| @@ -103,9 +102,16 @@ static void cpu_idle_loop(void) | |||
| 103 | } else { | 102 | } else { |
| 104 | local_irq_enable(); | 103 | local_irq_enable(); |
| 105 | } | 104 | } |
| 106 | current_set_polling(); | 105 | __current_set_polling(); |
| 107 | } | 106 | } |
| 108 | arch_cpu_idle_exit(); | 107 | arch_cpu_idle_exit(); |
| 108 | /* | ||
| 109 | * We need to test and propagate the TIF_NEED_RESCHED | ||
| 110 | * bit here because we might not have send the | ||
| 111 | * reschedule IPI to idle tasks. | ||
| 112 | */ | ||
| 113 | if (tif_need_resched()) | ||
| 114 | set_preempt_need_resched(); | ||
| 109 | } | 115 | } |
| 110 | tick_nohz_idle_exit(); | 116 | tick_nohz_idle_exit(); |
| 111 | schedule_preempt_disabled(); | 117 | schedule_preempt_disabled(); |
| @@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
| 129 | */ | 135 | */ |
| 130 | boot_init_stack_canary(); | 136 | boot_init_stack_canary(); |
| 131 | #endif | 137 | #endif |
| 132 | current_set_polling(); | 138 | __current_set_polling(); |
| 133 | arch_cpu_idle_prepare(); | 139 | arch_cpu_idle_prepare(); |
| 134 | cpu_idle_loop(); | 140 | cpu_idle_loop(); |
| 135 | } | 141 | } |
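The hunks in this file all target the same race: the idle task must publish that it is polling before it re-checks TIF_NEED_RESCHED, and a remote waker must set the flag before deciding whether an IPI is needed, otherwise a wakeup can be lost; the last hunk additionally folds the flag into set_preempt_need_resched() because no IPI was sent to a polling CPU. A rough userspace rendering of that handshake with C11 atomics, all names ours; the default seq_cst ordering stands in for the kernel's full barriers:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool need_resched_flag;	/* ~ TIF_NEED_RESCHED */
static atomic_bool cpu_is_polling;	/* ~ TIF_POLLING_NRFLAG */

/* Idle side: advertise polling, then re-check for work.
 * Returns true if the low-power wait must be skipped. */
static bool idle_enter_poll(void)
{
	atomic_store(&cpu_is_polling, true);
	return atomic_load(&need_resched_flag);
}

/* Waker side: set the work flag, then look at the polling bit.
 * Returns true if a reschedule IPI is still required.  With seq_cst
 * ordering at least one of the two sides always sees the other. */
static bool wake_needs_ipi(void)
{
	atomic_store(&need_resched_flag, true);
	return !atomic_load(&cpu_is_polling);
}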
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d49a9d29334c..8c875ef6e120 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
| @@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | |||
| 175 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | 175 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); |
| 176 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; | 176 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; |
| 177 | 177 | ||
| 178 | static atomic_t perf_sample_allowed_ns __read_mostly = | 178 | static int perf_sample_allowed_ns __read_mostly = |
| 179 | ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); | 179 | DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; |
| 180 | 180 | ||
| 181 | void update_perf_cpu_limits(void) | 181 | void update_perf_cpu_limits(void) |
| 182 | { | 182 | { |
| @@ -184,7 +184,7 @@ void update_perf_cpu_limits(void) | |||
| 184 | 184 | ||
| 185 | tmp *= sysctl_perf_cpu_time_max_percent; | 185 | tmp *= sysctl_perf_cpu_time_max_percent; |
| 186 | do_div(tmp, 100); | 186 | do_div(tmp, 100); |
| 187 | atomic_set(&perf_sample_allowed_ns, tmp); | 187 | ACCESS_ONCE(perf_sample_allowed_ns) = tmp; |
| 188 | } | 188 | } |
| 189 | 189 | ||
| 190 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); | 190 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); |
| @@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, | |||
| 193 | void __user *buffer, size_t *lenp, | 193 | void __user *buffer, size_t *lenp, |
| 194 | loff_t *ppos) | 194 | loff_t *ppos) |
| 195 | { | 195 | { |
| 196 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | 196 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 197 | 197 | ||
| 198 | if (ret || !write) | 198 | if (ret || !write) |
| 199 | return ret; | 199 | return ret; |
| @@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, | |||
| 228 | * we detect that events are taking too long. | 228 | * we detect that events are taking too long. |
| 229 | */ | 229 | */ |
| 230 | #define NR_ACCUMULATED_SAMPLES 128 | 230 | #define NR_ACCUMULATED_SAMPLES 128 |
| 231 | DEFINE_PER_CPU(u64, running_sample_length); | 231 | static DEFINE_PER_CPU(u64, running_sample_length); |
| 232 | 232 | ||
| 233 | void perf_sample_event_took(u64 sample_len_ns) | 233 | void perf_sample_event_took(u64 sample_len_ns) |
| 234 | { | 234 | { |
| 235 | u64 avg_local_sample_len; | 235 | u64 avg_local_sample_len; |
| 236 | u64 local_samples_len; | 236 | u64 local_samples_len; |
| 237 | u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); | ||
| 237 | 238 | ||
| 238 | if (atomic_read(&perf_sample_allowed_ns) == 0) | 239 | if (allowed_ns == 0) |
| 239 | return; | 240 | return; |
| 240 | 241 | ||
| 241 | /* decay the counter by 1 average sample */ | 242 | /* decay the counter by 1 average sample */ |
| @@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns) | |||
| 251 | */ | 252 | */ |
| 252 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; | 253 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; |
| 253 | 254 | ||
| 254 | if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) | 255 | if (avg_local_sample_len <= allowed_ns) |
| 255 | return; | 256 | return; |
| 256 | 257 | ||
| 257 | if (max_samples_per_tick <= 1) | 258 | if (max_samples_per_tick <= 1) |
| @@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns) | |||
| 262 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | 263 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; |
| 263 | 264 | ||
| 264 | printk_ratelimited(KERN_WARNING | 265 | printk_ratelimited(KERN_WARNING |
| 265 | "perf samples too long (%lld > %d), lowering " | 266 | "perf samples too long (%lld > %lld), lowering " |
| 266 | "kernel.perf_event_max_sample_rate to %d\n", | 267 | "kernel.perf_event_max_sample_rate to %d\n", |
| 267 | avg_local_sample_len, | 268 | avg_local_sample_len, allowed_ns, |
| 268 | atomic_read(&perf_sample_allowed_ns), | ||
| 269 | sysctl_perf_event_sample_rate); | 269 | sysctl_perf_event_sample_rate); |
| 270 | 270 | ||
| 271 | update_perf_cpu_limits(); | 271 | update_perf_cpu_limits(); |
| @@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx) | |||
| 899 | put_ctx(ctx->parent_ctx); | 899 | put_ctx(ctx->parent_ctx); |
| 900 | ctx->parent_ctx = NULL; | 900 | ctx->parent_ctx = NULL; |
| 901 | } | 901 | } |
| 902 | ctx->generation++; | ||
| 902 | } | 903 | } |
| 903 | 904 | ||
| 904 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | 905 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) |
| @@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1136 | ctx->nr_events++; | 1137 | ctx->nr_events++; |
| 1137 | if (event->attr.inherit_stat) | 1138 | if (event->attr.inherit_stat) |
| 1138 | ctx->nr_stat++; | 1139 | ctx->nr_stat++; |
| 1140 | |||
| 1141 | ctx->generation++; | ||
| 1139 | } | 1142 | } |
| 1140 | 1143 | ||
| 1141 | /* | 1144 | /* |
| @@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event) | |||
| 1201 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 1204 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
| 1202 | size += sizeof(data->data_src.val); | 1205 | size += sizeof(data->data_src.val); |
| 1203 | 1206 | ||
| 1207 | if (sample_type & PERF_SAMPLE_TRANSACTION) | ||
| 1208 | size += sizeof(data->txn); | ||
| 1209 | |||
| 1204 | event->header_size = size; | 1210 | event->header_size = size; |
| 1205 | } | 1211 | } |
| 1206 | 1212 | ||
| @@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1310 | */ | 1316 | */ |
| 1311 | if (event->state > PERF_EVENT_STATE_OFF) | 1317 | if (event->state > PERF_EVENT_STATE_OFF) |
| 1312 | event->state = PERF_EVENT_STATE_OFF; | 1318 | event->state = PERF_EVENT_STATE_OFF; |
| 1319 | |||
| 1320 | ctx->generation++; | ||
| 1313 | } | 1321 | } |
| 1314 | 1322 | ||
| 1315 | static void perf_group_detach(struct perf_event *event) | 1323 | static void perf_group_detach(struct perf_event *event) |
| @@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
| 2146 | } | 2154 | } |
| 2147 | 2155 | ||
| 2148 | /* | 2156 | /* |
| 2149 | * Test whether two contexts are equivalent, i.e. whether they | 2157 | * Test whether two contexts are equivalent, i.e. whether they have both been |
| 2150 | * have both been cloned from the same version of the same context | 2158 | * cloned from the same version of the same context. |
| 2151 | * and they both have the same number of enabled events. | 2159 | * |
| 2152 | * If the number of enabled events is the same, then the set | 2160 | * Equivalence is measured using a generation number in the context that is |
| 2153 | * of enabled events should be the same, because these are both | 2161 | * incremented on each modification to it; see unclone_ctx(), list_add_event() |
| 2154 | * inherited contexts, therefore we can't access individual events | 2162 | * and list_del_event(). |
| 2155 | * in them directly with an fd; we can only enable/disable all | ||
| 2156 | * events via prctl, or enable/disable all events in a family | ||
| 2157 | * via ioctl, which will have the same effect on both contexts. | ||
| 2158 | */ | 2163 | */ |
| 2159 | static int context_equiv(struct perf_event_context *ctx1, | 2164 | static int context_equiv(struct perf_event_context *ctx1, |
| 2160 | struct perf_event_context *ctx2) | 2165 | struct perf_event_context *ctx2) |
| 2161 | { | 2166 | { |
| 2162 | return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx | 2167 | /* Pinning disables the swap optimization */ |
| 2163 | && ctx1->parent_gen == ctx2->parent_gen | 2168 | if (ctx1->pin_count || ctx2->pin_count) |
| 2164 | && !ctx1->pin_count && !ctx2->pin_count; | 2169 | return 0; |
| 2170 | |||
| 2171 | /* If ctx1 is the parent of ctx2 */ | ||
| 2172 | if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) | ||
| 2173 | return 1; | ||
| 2174 | |||
| 2175 | /* If ctx2 is the parent of ctx1 */ | ||
| 2176 | if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) | ||
| 2177 | return 1; | ||
| 2178 | |||
| 2179 | /* | ||
| 2180 | * If ctx1 and ctx2 have the same parent; we flatten the parent | ||
| 2181 | * hierarchy, see perf_event_init_context(). | ||
| 2182 | */ | ||
| 2183 | if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && | ||
| 2184 | ctx1->parent_gen == ctx2->parent_gen) | ||
| 2185 | return 1; | ||
| 2186 | |||
| 2187 | /* Unmatched */ | ||
| 2188 | return 0; | ||
| 2165 | } | 2189 | } |
| 2166 | 2190 | ||
| 2167 | static void __perf_event_sync_stat(struct perf_event *event, | 2191 | static void __perf_event_sync_stat(struct perf_event *event, |
| @@ -2244,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2244 | { | 2268 | { |
| 2245 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; | 2269 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
| 2246 | struct perf_event_context *next_ctx; | 2270 | struct perf_event_context *next_ctx; |
| 2247 | struct perf_event_context *parent; | 2271 | struct perf_event_context *parent, *next_parent; |
| 2248 | struct perf_cpu_context *cpuctx; | 2272 | struct perf_cpu_context *cpuctx; |
| 2249 | int do_switch = 1; | 2273 | int do_switch = 1; |
| 2250 | 2274 | ||
| @@ -2256,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2256 | return; | 2280 | return; |
| 2257 | 2281 | ||
| 2258 | rcu_read_lock(); | 2282 | rcu_read_lock(); |
| 2259 | parent = rcu_dereference(ctx->parent_ctx); | ||
| 2260 | next_ctx = next->perf_event_ctxp[ctxn]; | 2283 | next_ctx = next->perf_event_ctxp[ctxn]; |
| 2261 | if (parent && next_ctx && | 2284 | if (!next_ctx) |
| 2262 | rcu_dereference(next_ctx->parent_ctx) == parent) { | 2285 | goto unlock; |
| 2286 | |||
| 2287 | parent = rcu_dereference(ctx->parent_ctx); | ||
| 2288 | next_parent = rcu_dereference(next_ctx->parent_ctx); | ||
| 2289 | |||
| 2290 | /* If neither context have a parent context; they cannot be clones. */ | ||
| 2291 | if (!parent && !next_parent) | ||
| 2292 | goto unlock; | ||
| 2293 | |||
| 2294 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { | ||
| 2263 | /* | 2295 | /* |
| 2264 | * Looks like the two contexts are clones, so we might be | 2296 | * Looks like the two contexts are clones, so we might be |
| 2265 | * able to optimize the context switch. We lock both | 2297 | * able to optimize the context switch. We lock both |
| @@ -2287,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2287 | raw_spin_unlock(&next_ctx->lock); | 2319 | raw_spin_unlock(&next_ctx->lock); |
| 2288 | raw_spin_unlock(&ctx->lock); | 2320 | raw_spin_unlock(&ctx->lock); |
| 2289 | } | 2321 | } |
| 2322 | unlock: | ||
| 2290 | rcu_read_unlock(); | 2323 | rcu_read_unlock(); |
| 2291 | 2324 | ||
| 2292 | if (do_switch) { | 2325 | if (do_switch) { |
| @@ -4572,6 +4605,9 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 4572 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 4605 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
| 4573 | perf_output_put(handle, data->data_src.val); | 4606 | perf_output_put(handle, data->data_src.val); |
| 4574 | 4607 | ||
| 4608 | if (sample_type & PERF_SAMPLE_TRANSACTION) | ||
| 4609 | perf_output_put(handle, data->txn); | ||
| 4610 | |||
| 4575 | if (!event->attr.watermark) { | 4611 | if (!event->attr.watermark) { |
| 4576 | int wakeup_events = event->attr.wakeup_events; | 4612 | int wakeup_events = event->attr.wakeup_events; |
| 4577 | 4613 | ||
| @@ -5100,27 +5136,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
| 5100 | unsigned int size; | 5136 | unsigned int size; |
| 5101 | char tmp[16]; | 5137 | char tmp[16]; |
| 5102 | char *buf = NULL; | 5138 | char *buf = NULL; |
| 5103 | const char *name; | 5139 | char *name; |
| 5104 | |||
| 5105 | memset(tmp, 0, sizeof(tmp)); | ||
| 5106 | 5140 | ||
| 5107 | if (file) { | 5141 | if (file) { |
| 5108 | struct inode *inode; | 5142 | struct inode *inode; |
| 5109 | dev_t dev; | 5143 | dev_t dev; |
| 5144 | |||
| 5145 | buf = kmalloc(PATH_MAX, GFP_KERNEL); | ||
| 5146 | if (!buf) { | ||
| 5147 | name = "//enomem"; | ||
| 5148 | goto cpy_name; | ||
| 5149 | } | ||
| 5110 | /* | 5150 | /* |
| 5111 | * d_path works from the end of the rb backwards, so we | 5151 | * d_path() works from the end of the rb backwards, so we |
| 5112 | * need to add enough zero bytes after the string to handle | 5152 | * need to add enough zero bytes after the string to handle |
| 5113 | * the 64bit alignment we do later. | 5153 | * the 64bit alignment we do later. |
| 5114 | */ | 5154 | */ |
| 5115 | buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); | 5155 | name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); |
| 5116 | if (!buf) { | ||
| 5117 | name = strncpy(tmp, "//enomem", sizeof(tmp)); | ||
| 5118 | goto got_name; | ||
| 5119 | } | ||
| 5120 | name = d_path(&file->f_path, buf, PATH_MAX); | ||
| 5121 | if (IS_ERR(name)) { | 5156 | if (IS_ERR(name)) { |
| 5122 | name = strncpy(tmp, "//toolong", sizeof(tmp)); | 5157 | name = "//toolong"; |
| 5123 | goto got_name; | 5158 | goto cpy_name; |
| 5124 | } | 5159 | } |
| 5125 | inode = file_inode(vma->vm_file); | 5160 | inode = file_inode(vma->vm_file); |
| 5126 | dev = inode->i_sb->s_dev; | 5161 | dev = inode->i_sb->s_dev; |
| @@ -5128,34 +5163,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
| 5128 | gen = inode->i_generation; | 5163 | gen = inode->i_generation; |
| 5129 | maj = MAJOR(dev); | 5164 | maj = MAJOR(dev); |
| 5130 | min = MINOR(dev); | 5165 | min = MINOR(dev); |
| 5131 | 5166 | goto got_name; | |
| 5132 | } else { | 5167 | } else { |
| 5133 | if (arch_vma_name(mmap_event->vma)) { | 5168 | name = (char *)arch_vma_name(vma); |
| 5134 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), | 5169 | if (name) |
| 5135 | sizeof(tmp) - 1); | 5170 | goto cpy_name; |
| 5136 | tmp[sizeof(tmp) - 1] = '\0'; | ||
| 5137 | goto got_name; | ||
| 5138 | } | ||
| 5139 | 5171 | ||
| 5140 | if (!vma->vm_mm) { | 5172 | if (vma->vm_start <= vma->vm_mm->start_brk && |
| 5141 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); | ||
| 5142 | goto got_name; | ||
| 5143 | } else if (vma->vm_start <= vma->vm_mm->start_brk && | ||
| 5144 | vma->vm_end >= vma->vm_mm->brk) { | 5173 | vma->vm_end >= vma->vm_mm->brk) { |
| 5145 | name = strncpy(tmp, "[heap]", sizeof(tmp)); | 5174 | name = "[heap]"; |
| 5146 | goto got_name; | 5175 | goto cpy_name; |
| 5147 | } else if (vma->vm_start <= vma->vm_mm->start_stack && | 5176 | } |
| 5177 | if (vma->vm_start <= vma->vm_mm->start_stack && | ||
| 5148 | vma->vm_end >= vma->vm_mm->start_stack) { | 5178 | vma->vm_end >= vma->vm_mm->start_stack) { |
| 5149 | name = strncpy(tmp, "[stack]", sizeof(tmp)); | 5179 | name = "[stack]"; |
| 5150 | goto got_name; | 5180 | goto cpy_name; |
| 5151 | } | 5181 | } |
| 5152 | 5182 | ||
| 5153 | name = strncpy(tmp, "//anon", sizeof(tmp)); | 5183 | name = "//anon"; |
| 5154 | goto got_name; | 5184 | goto cpy_name; |
| 5155 | } | 5185 | } |
| 5156 | 5186 | ||
| 5187 | cpy_name: | ||
| 5188 | strlcpy(tmp, name, sizeof(tmp)); | ||
| 5189 | name = tmp; | ||
| 5157 | got_name: | 5190 | got_name: |
| 5158 | size = ALIGN(strlen(name)+1, sizeof(u64)); | 5191 | /* |
| 5192 | * Since our buffer works in 8 byte units we need to align our string | ||
| 5193 | * size to a multiple of 8. However, we must guarantee the tail end is | ||
| 5194 | * zero'd out to avoid leaking random bits to userspace. | ||
| 5195 | */ | ||
| 5196 | size = strlen(name)+1; | ||
| 5197 | while (!IS_ALIGNED(size, sizeof(u64))) | ||
| 5198 | name[size++] = '\0'; | ||
| 5159 | 5199 | ||
| 5160 | mmap_event->file_name = name; | 5200 | mmap_event->file_name = name; |
| 5161 | mmap_event->file_size = size; | 5201 | mmap_event->file_size = size; |
| @@ -6292,6 +6332,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) | |||
| 6292 | 6332 | ||
| 6293 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | 6333 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); |
| 6294 | } | 6334 | } |
| 6335 | static DEVICE_ATTR_RO(type); | ||
| 6295 | 6336 | ||
| 6296 | static ssize_t | 6337 | static ssize_t |
| 6297 | perf_event_mux_interval_ms_show(struct device *dev, | 6338 | perf_event_mux_interval_ms_show(struct device *dev, |
| @@ -6336,17 +6377,19 @@ perf_event_mux_interval_ms_store(struct device *dev, | |||
| 6336 | 6377 | ||
| 6337 | return count; | 6378 | return count; |
| 6338 | } | 6379 | } |
| 6380 | static DEVICE_ATTR_RW(perf_event_mux_interval_ms); | ||
| 6339 | 6381 | ||
| 6340 | static struct device_attribute pmu_dev_attrs[] = { | 6382 | static struct attribute *pmu_dev_attrs[] = { |
| 6341 | __ATTR_RO(type), | 6383 | &dev_attr_type.attr, |
| 6342 | __ATTR_RW(perf_event_mux_interval_ms), | 6384 | &dev_attr_perf_event_mux_interval_ms.attr, |
| 6343 | __ATTR_NULL, | 6385 | NULL, |
| 6344 | }; | 6386 | }; |
| 6387 | ATTRIBUTE_GROUPS(pmu_dev); | ||
| 6345 | 6388 | ||
| 6346 | static int pmu_bus_running; | 6389 | static int pmu_bus_running; |
| 6347 | static struct bus_type pmu_bus = { | 6390 | static struct bus_type pmu_bus = { |
| 6348 | .name = "event_source", | 6391 | .name = "event_source", |
| 6349 | .dev_attrs = pmu_dev_attrs, | 6392 | .dev_groups = pmu_dev_groups, |
| 6350 | }; | 6393 | }; |
| 6351 | 6394 | ||
| 6352 | static void pmu_dev_release(struct device *dev) | 6395 | static void pmu_dev_release(struct device *dev) |
| @@ -6767,6 +6810,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
| 6767 | if (ret) | 6810 | if (ret) |
| 6768 | return -EFAULT; | 6811 | return -EFAULT; |
| 6769 | 6812 | ||
| 6813 | /* disabled for now */ | ||
| 6814 | if (attr->mmap2) | ||
| 6815 | return -EINVAL; | ||
| 6816 | |||
| 6770 | if (attr->__reserved_1) | 6817 | if (attr->__reserved_1) |
| 6771 | return -EINVAL; | 6818 | return -EINVAL; |
| 6772 | 6819 | ||
| @@ -7122,7 +7169,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7122 | } | 7169 | } |
| 7123 | 7170 | ||
| 7124 | perf_install_in_context(ctx, event, event->cpu); | 7171 | perf_install_in_context(ctx, event, event->cpu); |
| 7125 | ++ctx->generation; | ||
| 7126 | perf_unpin_context(ctx); | 7172 | perf_unpin_context(ctx); |
| 7127 | mutex_unlock(&ctx->mutex); | 7173 | mutex_unlock(&ctx->mutex); |
| 7128 | 7174 | ||
| @@ -7205,7 +7251,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7205 | WARN_ON_ONCE(ctx->parent_ctx); | 7251 | WARN_ON_ONCE(ctx->parent_ctx); |
| 7206 | mutex_lock(&ctx->mutex); | 7252 | mutex_lock(&ctx->mutex); |
| 7207 | perf_install_in_context(ctx, event, cpu); | 7253 | perf_install_in_context(ctx, event, cpu); |
| 7208 | ++ctx->generation; | ||
| 7209 | perf_unpin_context(ctx); | 7254 | perf_unpin_context(ctx); |
| 7210 | mutex_unlock(&ctx->mutex); | 7255 | mutex_unlock(&ctx->mutex); |
| 7211 | 7256 | ||
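Among the smaller changes above, the got_name rework is the easiest to get wrong: the mmap record is emitted in u64 units, so the file-name length is rounded up to a multiple of 8 and the padding bytes are explicitly zeroed, since the switch from kzalloc() to kmalloc() would otherwise leak whatever happened to be in the buffer. A standalone rendition of just that padding step; buffer size and names are ours:

#include <stdio.h>
#include <string.h>

#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

/* Round the NUL-terminated string length up to a multiple of 8 and
 * zero the tail explicitly, mirroring the code after got_name:. */
static size_t pad_name(char *name)
{
	size_t size = strlen(name) + 1;

	while (!IS_ALIGNED(size, sizeof(unsigned long long)))
		name[size++] = '\0';

	return size;
}

int main(void)
{
	char buf[32] = "//anon";

	printf("\"%s\" -> %zu bytes on the wire\n", buf, pad_name(buf));
	return 0;
}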
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be5..569b218782ad 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
| @@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) | |||
| 82 | } | 82 | } |
| 83 | 83 | ||
| 84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ |
| 85 | static inline unsigned int \ | 85 | static inline unsigned long \ |
| 86 | func_name(struct perf_output_handle *handle, \ | 86 | func_name(struct perf_output_handle *handle, \ |
| 87 | const void *buf, unsigned int len) \ | 87 | const void *buf, unsigned long len) \ |
| 88 | { \ | 88 | { \ |
| 89 | unsigned long size, written; \ | 89 | unsigned long size, written; \ |
| 90 | \ | 90 | \ |
| 91 | do { \ | 91 | do { \ |
| 92 | size = min_t(unsigned long, handle->size, len); \ | 92 | size = min(handle->size, len); \ |
| 93 | \ | ||
| 94 | written = memcpy_func(handle->addr, buf, size); \ | 93 | written = memcpy_func(handle->addr, buf, size); \ |
| 94 | written = size - written; \ | ||
| 95 | \ | 95 | \ |
| 96 | len -= written; \ | 96 | len -= written; \ |
| 97 | handle->addr += written; \ | 97 | handle->addr += written; \ |
| @@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \ | |||
| 110 | return len; \ | 110 | return len; \ |
| 111 | } | 111 | } |
| 112 | 112 | ||
| 113 | static inline int memcpy_common(void *dst, const void *src, size_t n) | 113 | static inline unsigned long |
| 114 | memcpy_common(void *dst, const void *src, unsigned long n) | ||
| 114 | { | 115 | { |
| 115 | memcpy(dst, src, n); | 116 | memcpy(dst, src, n); |
| 116 | return n; | 117 | return 0; |
| 117 | } | 118 | } |
| 118 | 119 | ||
| 119 | DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) | 120 | DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) |
| 120 | 121 | ||
| 121 | #define MEMCPY_SKIP(dst, src, n) (n) | 122 | static inline unsigned long |
| 123 | memcpy_skip(void *dst, const void *src, unsigned long n) | ||
| 124 | { | ||
| 125 | return 0; | ||
| 126 | } | ||
| 122 | 127 | ||
| 123 | DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) | 128 | DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip) |
| 124 | 129 | ||
| 125 | #ifndef arch_perf_out_copy_user | 130 | #ifndef arch_perf_out_copy_user |
| 126 | #define arch_perf_out_copy_user __copy_from_user_inatomic | 131 | #define arch_perf_out_copy_user arch_perf_out_copy_user |
| 132 | |||
| 133 | static inline unsigned long | ||
| 134 | arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) | ||
| 135 | { | ||
| 136 | unsigned long ret; | ||
| 137 | |||
| 138 | pagefault_disable(); | ||
| 139 | ret = __copy_from_user_inatomic(dst, src, n); | ||
| 140 | pagefault_enable(); | ||
| 141 | |||
| 142 | return ret; | ||
| 143 | } | ||
| 127 | #endif | 144 | #endif |
| 128 | 145 | ||
| 129 | DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) | 146 | DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) |
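The contract change in DEFINE_OUTPUT_COPY is subtle: the per-flavour copy helpers now return the number of bytes that were NOT copied, the same convention as __copy_from_user_inatomic(), and the macro body derives the "written" count from that. A small standalone sketch of the same convention outside the kernel; every identifier here is invented for illustration:

#include <stdio.h>
#include <string.h>

/* Low-level step: returns how many bytes it did NOT copy (0 on full
 * success), the way __copy_from_user_inatomic() reports progress. */
static unsigned long copy_step(void *dst, const void *src, unsigned long n)
{
	memcpy(dst, src, n);
	return 0;			/* nothing left over */
}

static unsigned long output_copy(char *dst, unsigned long space,
				 const char *buf, unsigned long len)
{
	unsigned long size, left, written;

	do {
		size = space < len ? space : len;
		left = copy_step(dst, buf, size);
		written = size - left;

		len -= written;
		dst += written;
		buf += written;
		space -= written;
	} while (len && size && written == size);

	return len;			/* bytes that never made it */
}

int main(void)
{
	char out[16];

	printf("left unwritten: %lu\n",
	       output_copy(out, sizeof(out), "hello, world", 13));
	return 0;
}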
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b5..e8b168af135b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
| @@ -12,40 +12,10 @@ | |||
| 12 | #include <linux/perf_event.h> | 12 | #include <linux/perf_event.h> |
| 13 | #include <linux/vmalloc.h> | 13 | #include <linux/vmalloc.h> |
| 14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
| 15 | #include <linux/circ_buf.h> | ||
| 15 | 16 | ||
| 16 | #include "internal.h" | 17 | #include "internal.h" |
| 17 | 18 | ||
| 18 | static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, | ||
| 19 | unsigned long offset, unsigned long head) | ||
| 20 | { | ||
| 21 | unsigned long sz = perf_data_size(rb); | ||
| 22 | unsigned long mask = sz - 1; | ||
| 23 | |||
| 24 | /* | ||
| 25 | * check if user-writable | ||
| 26 | * overwrite : over-write its own tail | ||
| 27 | * !overwrite: buffer possibly drops events. | ||
| 28 | */ | ||
| 29 | if (rb->overwrite) | ||
| 30 | return true; | ||
| 31 | |||
| 32 | /* | ||
| 33 | * verify that payload is not bigger than buffer | ||
| 34 | * otherwise masking logic may fail to detect | ||
| 35 | * the "not enough space" condition | ||
| 36 | */ | ||
| 37 | if ((head - offset) > sz) | ||
| 38 | return false; | ||
| 39 | |||
| 40 | offset = (offset - tail) & mask; | ||
| 41 | head = (head - tail) & mask; | ||
| 42 | |||
| 43 | if ((int)(head - offset) < 0) | ||
| 44 | return false; | ||
| 45 | |||
| 46 | return true; | ||
| 47 | } | ||
| 48 | |||
| 49 | static void perf_output_wakeup(struct perf_output_handle *handle) | 19 | static void perf_output_wakeup(struct perf_output_handle *handle) |
| 50 | { | 20 | { |
| 51 | atomic_set(&handle->rb->poll, POLL_IN); | 21 | atomic_set(&handle->rb->poll, POLL_IN); |
| @@ -87,15 +57,36 @@ again: | |||
| 87 | goto out; | 57 | goto out; |
| 88 | 58 | ||
| 89 | /* | 59 | /* |
| 90 | * Publish the known good head. Rely on the full barrier implied | 60 | * Since the mmap() consumer (userspace) can run on a different CPU: |
| 91 | * by atomic_dec_and_test() order the rb->head read and this | 61 | * |
| 92 | * write. | 62 | * kernel user |
| 63 | * | ||
| 64 | * READ ->data_tail READ ->data_head | ||
| 65 | * smp_mb() (A) smp_rmb() (C) | ||
| 66 | * WRITE $data READ $data | ||
| 67 | * smp_wmb() (B) smp_mb() (D) | ||
| 68 | * STORE ->data_head WRITE ->data_tail | ||
| 69 | * | ||
| 70 | * Where A pairs with D, and B pairs with C. | ||
| 71 | * | ||
| 72 | * I don't think A needs to be a full barrier because we won't in fact | ||
| 73 | * write data until we see the store from userspace. So we simply don't | ||
| 74 | * issue the data WRITE until we observe it. Be conservative for now. | ||
| 75 | * | ||
| 76 | * OTOH, D needs to be a full barrier since it separates the data READ | ||
| 77 | * from the tail WRITE. | ||
| 78 | * | ||
| 79 | * For B a WMB is sufficient since it separates two WRITEs, and for C | ||
| 80 | * an RMB is sufficient since it separates two READs. | ||
| 81 | * | ||
| 82 | * See perf_output_begin(). | ||
| 93 | */ | 83 | */ |
| 84 | smp_wmb(); | ||
| 94 | rb->user_page->data_head = head; | 85 | rb->user_page->data_head = head; |
| 95 | 86 | ||
| 96 | /* | 87 | /* |
| 97 | * Now check if we missed an update, rely on the (compiler) | 88 | * Now check if we missed an update -- rely on previous implied |
| 98 | * barrier in atomic_dec_and_test() to re-read rb->head. | 89 | * compiler barriers to force a re-read. |
| 99 | */ | 90 | */ |
| 100 | if (unlikely(head != local_read(&rb->head))) { | 91 | if (unlikely(head != local_read(&rb->head))) { |
| 101 | local_inc(&rb->nest); | 92 | local_inc(&rb->nest); |
| @@ -114,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 114 | { | 105 | { |
| 115 | struct ring_buffer *rb; | 106 | struct ring_buffer *rb; |
| 116 | unsigned long tail, offset, head; | 107 | unsigned long tail, offset, head; |
| 117 | int have_lost; | 108 | int have_lost, page_shift; |
| 118 | struct perf_sample_data sample_data; | ||
| 119 | struct { | 109 | struct { |
| 120 | struct perf_event_header header; | 110 | struct perf_event_header header; |
| 121 | u64 id; | 111 | u64 id; |
| @@ -130,55 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 130 | event = event->parent; | 120 | event = event->parent; |
| 131 | 121 | ||
| 132 | rb = rcu_dereference(event->rb); | 122 | rb = rcu_dereference(event->rb); |
| 133 | if (!rb) | 123 | if (unlikely(!rb)) |
| 134 | goto out; | 124 | goto out; |
| 135 | 125 | ||
| 136 | handle->rb = rb; | 126 | if (unlikely(!rb->nr_pages)) |
| 137 | handle->event = event; | ||
| 138 | |||
| 139 | if (!rb->nr_pages) | ||
| 140 | goto out; | 127 | goto out; |
| 141 | 128 | ||
| 129 | handle->rb = rb; | ||
| 130 | handle->event = event; | ||
| 131 | |||
| 142 | have_lost = local_read(&rb->lost); | 132 | have_lost = local_read(&rb->lost); |
| 143 | if (have_lost) { | 133 | if (unlikely(have_lost)) { |
| 144 | lost_event.header.size = sizeof(lost_event); | 134 | size += sizeof(lost_event); |
| 145 | perf_event_header__init_id(&lost_event.header, &sample_data, | 135 | if (event->attr.sample_id_all) |
| 146 | event); | 136 | size += event->id_header_size; |
| 147 | size += lost_event.header.size; | ||
| 148 | } | 137 | } |
| 149 | 138 | ||
| 150 | perf_output_get_handle(handle); | 139 | perf_output_get_handle(handle); |
| 151 | 140 | ||
| 152 | do { | 141 | do { |
| 153 | /* | ||
| 154 | * Userspace could choose to issue a mb() before updating the | ||
| 155 | * tail pointer. So that all reads will be completed before the | ||
| 156 | * write is issued. | ||
| 157 | */ | ||
| 158 | tail = ACCESS_ONCE(rb->user_page->data_tail); | 142 | tail = ACCESS_ONCE(rb->user_page->data_tail); |
| 159 | smp_rmb(); | ||
| 160 | offset = head = local_read(&rb->head); | 143 | offset = head = local_read(&rb->head); |
| 161 | head += size; | 144 | if (!rb->overwrite && |
| 162 | if (unlikely(!perf_output_space(rb, tail, offset, head))) | 145 | unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) |
| 163 | goto fail; | 146 | goto fail; |
| 147 | head += size; | ||
| 164 | } while (local_cmpxchg(&rb->head, offset, head) != offset); | 148 | } while (local_cmpxchg(&rb->head, offset, head) != offset); |
| 165 | 149 | ||
| 166 | if (head - local_read(&rb->wakeup) > rb->watermark) | 150 | /* |
| 151 | * Separate the userpage->tail read from the data stores below. | ||
| 152 | * Matches the MB userspace SHOULD issue after reading the data | ||
| 153 | * and before storing the new tail position. | ||
| 154 | * | ||
| 155 | * See perf_output_put_handle(). | ||
| 156 | */ | ||
| 157 | smp_mb(); | ||
| 158 | |||
| 159 | if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) | ||
| 167 | local_add(rb->watermark, &rb->wakeup); | 160 | local_add(rb->watermark, &rb->wakeup); |
| 168 | 161 | ||
| 169 | handle->page = offset >> (PAGE_SHIFT + page_order(rb)); | 162 | page_shift = PAGE_SHIFT + page_order(rb); |
| 170 | handle->page &= rb->nr_pages - 1; | ||
| 171 | handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); | ||
| 172 | handle->addr = rb->data_pages[handle->page]; | ||
| 173 | handle->addr += handle->size; | ||
| 174 | handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; | ||
| 175 | 163 | ||
| 176 | if (have_lost) { | 164 | handle->page = (offset >> page_shift) & (rb->nr_pages - 1); |
| 165 | offset &= (1UL << page_shift) - 1; | ||
| 166 | handle->addr = rb->data_pages[handle->page] + offset; | ||
| 167 | handle->size = (1UL << page_shift) - offset; | ||
| 168 | |||
| 169 | if (unlikely(have_lost)) { | ||
| 170 | struct perf_sample_data sample_data; | ||
| 171 | |||
| 172 | lost_event.header.size = sizeof(lost_event); | ||
| 177 | lost_event.header.type = PERF_RECORD_LOST; | 173 | lost_event.header.type = PERF_RECORD_LOST; |
| 178 | lost_event.header.misc = 0; | 174 | lost_event.header.misc = 0; |
| 179 | lost_event.id = event->id; | 175 | lost_event.id = event->id; |
| 180 | lost_event.lost = local_xchg(&rb->lost, 0); | 176 | lost_event.lost = local_xchg(&rb->lost, 0); |
| 181 | 177 | ||
| 178 | perf_event_header__init_id(&lost_event.header, | ||
| 179 | &sample_data, event); | ||
| 182 | perf_output_put(handle, lost_event); | 180 | perf_output_put(handle, lost_event); |
| 183 | perf_event__output_id_sample(event, handle, &sample_data); | 181 | perf_event__output_id_sample(event, handle, &sample_data); |
| 184 | } | 182 | } |
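The new comment in perf_output_put_handle() spells out the producer/consumer ordering contract that the smp_mb() added to perf_output_begin() completes. For reference, this is roughly what the userspace half of that ladder (barriers C and D) looks like when written with C11 atomics against a generic control page; the struct only mimics the real perf userpage, it is not its layout:

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

struct ring_ctrl {
	_Atomic uint64_t data_head;	/* advanced by the producer/kernel */
	_Atomic uint64_t data_tail;	/* advanced by the consumer/user   */
};

static size_t consume(struct ring_ctrl *ctrl, const unsigned char *data,
		      size_t size_mask, unsigned char *out, size_t outsz)
{
	/* (C): everything written before the head update is visible now */
	uint64_t head = atomic_load_explicit(&ctrl->data_head,
					     memory_order_acquire);
	uint64_t tail = atomic_load_explicit(&ctrl->data_tail,
					     memory_order_relaxed);
	size_t n = (size_t)(head - tail);

	if (n > outsz)
		n = outsz;
	for (size_t i = 0; i < n; i++)
		out[i] = data[(tail + i) & size_mask];

	/* (D): all data reads above are ordered before the tail publish */
	atomic_store_explicit(&ctrl->data_tail, tail + n,
			      memory_order_release);
	return n;
}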
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ad8e1bdca70e..24b7d6ca871b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/kdebug.h> /* notifier mechanism */ | 35 | #include <linux/kdebug.h> /* notifier mechanism */ |
| 36 | #include "../../mm/internal.h" /* munlock_vma_page */ | 36 | #include "../../mm/internal.h" /* munlock_vma_page */ |
| 37 | #include <linux/percpu-rwsem.h> | 37 | #include <linux/percpu-rwsem.h> |
| 38 | #include <linux/task_work.h> | ||
| 38 | 39 | ||
| 39 | #include <linux/uprobes.h> | 40 | #include <linux/uprobes.h> |
| 40 | 41 | ||
| @@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
| 244 | * the architecture. If an arch has variable length instruction and the | 245 | * the architecture. If an arch has variable length instruction and the |
| 245 | * breakpoint instruction is not of the smallest length instruction | 246 | * breakpoint instruction is not of the smallest length instruction |
| 246 | * supported by that architecture then we need to modify is_trap_at_addr and | 247 | * supported by that architecture then we need to modify is_trap_at_addr and |
| 247 | * write_opcode accordingly. This would never be a problem for archs that | 248 | * uprobe_write_opcode accordingly. This would never be a problem for archs |
| 248 | * have fixed length instructions. | 249 | * that have fixed length instructions. |
| 249 | */ | 250 | */ |
| 250 | 251 | ||
| 251 | /* | 252 | /* |
| 252 | * write_opcode - write the opcode at a given virtual address. | 253 | * uprobe_write_opcode - write the opcode at a given virtual address. |
| 253 | * @mm: the probed process address space. | 254 | * @mm: the probed process address space. |
| 254 | * @vaddr: the virtual address to store the opcode. | 255 | * @vaddr: the virtual address to store the opcode. |
| 255 | * @opcode: opcode to be written at @vaddr. | 256 | * @opcode: opcode to be written at @vaddr. |
| @@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
| 260 | * For mm @mm, write the opcode at @vaddr. | 261 | * For mm @mm, write the opcode at @vaddr. |
| 261 | * Return 0 (success) or a negative errno. | 262 | * Return 0 (success) or a negative errno. |
| 262 | */ | 263 | */ |
| 263 | static int write_opcode(struct mm_struct *mm, unsigned long vaddr, | 264 | int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, |
| 264 | uprobe_opcode_t opcode) | 265 | uprobe_opcode_t opcode) |
| 265 | { | 266 | { |
| 266 | struct page *old_page, *new_page; | 267 | struct page *old_page, *new_page; |
| @@ -314,7 +315,7 @@ put_old: | |||
| 314 | */ | 315 | */ |
| 315 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 316 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
| 316 | { | 317 | { |
| 317 | return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); | 318 | return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); |
| 318 | } | 319 | } |
| 319 | 320 | ||
| 320 | /** | 321 | /** |
| @@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned | |||
| 329 | int __weak | 330 | int __weak |
| 330 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 331 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
| 331 | { | 332 | { |
| 332 | return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); | 333 | return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); |
| 333 | } | 334 | } |
| 334 | 335 | ||
| 335 | static int match_uprobe(struct uprobe *l, struct uprobe *r) | 336 | static int match_uprobe(struct uprobe *l, struct uprobe *r) |
| @@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) | |||
| 503 | return ret; | 504 | return ret; |
| 504 | } | 505 | } |
| 505 | 506 | ||
| 506 | static int | 507 | static int __copy_insn(struct address_space *mapping, struct file *filp, |
| 507 | __copy_insn(struct address_space *mapping, struct file *filp, char *insn, | 508 | void *insn, int nbytes, loff_t offset) |
| 508 | unsigned long nbytes, loff_t offset) | ||
| 509 | { | 509 | { |
| 510 | struct page *page; | 510 | struct page *page; |
| 511 | 511 | ||
| @@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, | |||
| 527 | 527 | ||
| 528 | static int copy_insn(struct uprobe *uprobe, struct file *filp) | 528 | static int copy_insn(struct uprobe *uprobe, struct file *filp) |
| 529 | { | 529 | { |
| 530 | struct address_space *mapping; | 530 | struct address_space *mapping = uprobe->inode->i_mapping; |
| 531 | unsigned long nbytes; | 531 | loff_t offs = uprobe->offset; |
| 532 | int bytes; | 532 | void *insn = uprobe->arch.insn; |
| 533 | 533 | int size = MAX_UINSN_BYTES; | |
| 534 | nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); | 534 | int len, err = -EIO; |
| 535 | mapping = uprobe->inode->i_mapping; | ||
| 536 | 535 | ||
| 537 | /* Instruction at end of binary; copy only available bytes */ | 536 | /* Copy only available bytes, -EIO if nothing was read */ |
| 538 | if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) | 537 | do { |
| 539 | bytes = uprobe->inode->i_size - uprobe->offset; | 538 | if (offs >= i_size_read(uprobe->inode)) |
| 540 | else | 539 | break; |
| 541 | bytes = MAX_UINSN_BYTES; | ||
| 542 | 540 | ||
| 543 | /* Instruction at the page-boundary; copy bytes in second page */ | 541 | len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); |
| 544 | if (nbytes < bytes) { | 542 | err = __copy_insn(mapping, filp, insn, len, offs); |
| 545 | int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, | ||
| 546 | bytes - nbytes, uprobe->offset + nbytes); | ||
| 547 | if (err) | 543 | if (err) |
| 548 | return err; | 544 | break; |
| 549 | bytes = nbytes; | 545 | |
| 550 | } | 546 | insn += len; |
| 551 | return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); | 547 | offs += len; |
| 548 | size -= len; | ||
| 549 | } while (size); | ||
| 550 | |||
| 551 | return err; | ||
| 552 | } | 552 | } |
| 553 | 553 | ||
| 554 | static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | 554 | static int prepare_uprobe(struct uprobe *uprobe, struct file *file, |
| @@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | |||
| 576 | if (ret) | 576 | if (ret) |
| 577 | goto out; | 577 | goto out; |
| 578 | 578 | ||
| 579 | /* write_opcode() assumes we don't cross page boundary */ | 579 | /* uprobe_write_opcode() assumes we don't cross page boundary */ |
| 580 | BUG_ON((uprobe->offset & ~PAGE_MASK) + | 580 | BUG_ON((uprobe->offset & ~PAGE_MASK) + |
| 581 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | 581 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); |
| 582 | 582 | ||
| @@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon | |||
| 1096 | } | 1096 | } |
| 1097 | 1097 | ||
| 1098 | /* Slot allocation for XOL */ | 1098 | /* Slot allocation for XOL */ |
| 1099 | static int xol_add_vma(struct xol_area *area) | 1099 | static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) |
| 1100 | { | 1100 | { |
| 1101 | struct mm_struct *mm = current->mm; | ||
| 1102 | int ret = -EALREADY; | 1101 | int ret = -EALREADY; |
| 1103 | 1102 | ||
| 1104 | down_write(&mm->mmap_sem); | 1103 | down_write(&mm->mmap_sem); |
| 1105 | if (mm->uprobes_state.xol_area) | 1104 | if (mm->uprobes_state.xol_area) |
| 1106 | goto fail; | 1105 | goto fail; |
| 1107 | 1106 | ||
| 1108 | ret = -ENOMEM; | 1107 | if (!area->vaddr) { |
| 1109 | /* Try to map as high as possible, this is only a hint. */ | 1108 | /* Try to map as high as possible, this is only a hint. */ |
| 1110 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); | 1109 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, |
| 1111 | if (area->vaddr & ~PAGE_MASK) { | 1110 | PAGE_SIZE, 0, 0); |
| 1112 | ret = area->vaddr; | 1111 | if (area->vaddr & ~PAGE_MASK) { |
| 1113 | goto fail; | 1112 | ret = area->vaddr; |
| 1113 | goto fail; | ||
| 1114 | } | ||
| 1114 | } | 1115 | } |
| 1115 | 1116 | ||
| 1116 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, | 1117 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, |
| @@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area) | |||
| 1120 | 1121 | ||
| 1121 | smp_wmb(); /* pairs with get_xol_area() */ | 1122 | smp_wmb(); /* pairs with get_xol_area() */ |
| 1122 | mm->uprobes_state.xol_area = area; | 1123 | mm->uprobes_state.xol_area = area; |
| 1123 | ret = 0; | ||
| 1124 | fail: | 1124 | fail: |
| 1125 | up_write(&mm->mmap_sem); | 1125 | up_write(&mm->mmap_sem); |
| 1126 | 1126 | ||
| 1127 | return ret; | 1127 | return ret; |
| 1128 | } | 1128 | } |
| 1129 | 1129 | ||
| 1130 | /* | 1130 | static struct xol_area *__create_xol_area(unsigned long vaddr) |
| 1131 | * get_xol_area - Allocate process's xol_area if necessary. | ||
| 1132 | * This area will be used for storing instructions for execution out of line. | ||
| 1133 | * | ||
| 1134 | * Returns the allocated area or NULL. | ||
| 1135 | */ | ||
| 1136 | static struct xol_area *get_xol_area(void) | ||
| 1137 | { | 1131 | { |
| 1138 | struct mm_struct *mm = current->mm; | 1132 | struct mm_struct *mm = current->mm; |
| 1139 | struct xol_area *area; | ||
| 1140 | uprobe_opcode_t insn = UPROBE_SWBP_INSN; | 1133 | uprobe_opcode_t insn = UPROBE_SWBP_INSN; |
| 1134 | struct xol_area *area; | ||
| 1141 | 1135 | ||
| 1142 | area = mm->uprobes_state.xol_area; | 1136 | area = kmalloc(sizeof(*area), GFP_KERNEL); |
| 1143 | if (area) | ||
| 1144 | goto ret; | ||
| 1145 | |||
| 1146 | area = kzalloc(sizeof(*area), GFP_KERNEL); | ||
| 1147 | if (unlikely(!area)) | 1137 | if (unlikely(!area)) |
| 1148 | goto out; | 1138 | goto out; |
| 1149 | 1139 | ||
| @@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void) | |||
| 1155 | if (!area->page) | 1145 | if (!area->page) |
| 1156 | goto free_bitmap; | 1146 | goto free_bitmap; |
| 1157 | 1147 | ||
| 1158 | /* allocate first slot of task's xol_area for the return probes */ | 1148 | area->vaddr = vaddr; |
| 1149 | init_waitqueue_head(&area->wq); | ||
| 1150 | /* Reserve the 1st slot for get_trampoline_vaddr() */ | ||
| 1159 | set_bit(0, area->bitmap); | 1151 | set_bit(0, area->bitmap); |
| 1160 | copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); | ||
| 1161 | atomic_set(&area->slot_count, 1); | 1152 | atomic_set(&area->slot_count, 1); |
| 1162 | init_waitqueue_head(&area->wq); | 1153 | copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); |
| 1163 | 1154 | ||
| 1164 | if (!xol_add_vma(area)) | 1155 | if (!xol_add_vma(mm, area)) |
| 1165 | return area; | 1156 | return area; |
| 1166 | 1157 | ||
| 1167 | __free_page(area->page); | 1158 | __free_page(area->page); |
| @@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void) | |||
| 1170 | free_area: | 1161 | free_area: |
| 1171 | kfree(area); | 1162 | kfree(area); |
| 1172 | out: | 1163 | out: |
| 1164 | return NULL; | ||
| 1165 | } | ||
| 1166 | |||
| 1167 | /* | ||
| 1168 | * get_xol_area - Allocate process's xol_area if necessary. | ||
| 1169 | * This area will be used for storing instructions for execution out of line. | ||
| 1170 | * | ||
| 1171 | * Returns the allocated area or NULL. | ||
| 1172 | */ | ||
| 1173 | static struct xol_area *get_xol_area(void) | ||
| 1174 | { | ||
| 1175 | struct mm_struct *mm = current->mm; | ||
| 1176 | struct xol_area *area; | ||
| 1177 | |||
| 1178 | if (!mm->uprobes_state.xol_area) | ||
| 1179 | __create_xol_area(0); | ||
| 1180 | |||
| 1173 | area = mm->uprobes_state.xol_area; | 1181 | area = mm->uprobes_state.xol_area; |
| 1174 | ret: | 1182 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ |
| 1175 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ | ||
| 1176 | return area; | 1183 | return area; |
| 1177 | } | 1184 | } |
| 1178 | 1185 | ||
| @@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) | |||
| 1256 | return 0; | 1263 | return 0; |
| 1257 | 1264 | ||
| 1258 | /* Initialize the slot */ | 1265 | /* Initialize the slot */ |
| 1259 | copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); | 1266 | copy_to_page(area->page, xol_vaddr, |
| 1267 | uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); | ||
| 1260 | /* | 1268 | /* |
| 1261 | * We probably need flush_icache_user_range() but it needs vma. | 1269 | * We probably need flush_icache_user_range() but it needs vma. |
| 1262 | * This should work on supported architectures too. | 1270 | * This should work on supported architectures too. |
| @@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t) | |||
| 1345 | } | 1353 | } |
| 1346 | 1354 | ||
| 1347 | /* | 1355 | /* |
| 1348 | * Called in context of a new clone/fork from copy_process. | ||
| 1349 | */ | ||
| 1350 | void uprobe_copy_process(struct task_struct *t) | ||
| 1351 | { | ||
| 1352 | t->utask = NULL; | ||
| 1353 | } | ||
| 1354 | |||
| 1355 | /* | ||
| 1356 | * Allocate a uprobe_task object for the task if if necessary. | 1356 | * Allocate a uprobe_task object for the task if if necessary. |
| 1357 | * Called when the thread hits a breakpoint. | 1357 | * Called when the thread hits a breakpoint. |
| 1358 | * | 1358 | * |
| @@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void) | |||
| 1367 | return current->utask; | 1367 | return current->utask; |
| 1368 | } | 1368 | } |
| 1369 | 1369 | ||
| 1370 | static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) | ||
| 1371 | { | ||
| 1372 | struct uprobe_task *n_utask; | ||
| 1373 | struct return_instance **p, *o, *n; | ||
| 1374 | |||
| 1375 | n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); | ||
| 1376 | if (!n_utask) | ||
| 1377 | return -ENOMEM; | ||
| 1378 | t->utask = n_utask; | ||
| 1379 | |||
| 1380 | p = &n_utask->return_instances; | ||
| 1381 | for (o = o_utask->return_instances; o; o = o->next) { | ||
| 1382 | n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); | ||
| 1383 | if (!n) | ||
| 1384 | return -ENOMEM; | ||
| 1385 | |||
| 1386 | *n = *o; | ||
| 1387 | atomic_inc(&n->uprobe->ref); | ||
| 1388 | n->next = NULL; | ||
| 1389 | |||
| 1390 | *p = n; | ||
| 1391 | p = &n->next; | ||
| 1392 | n_utask->depth++; | ||
| 1393 | } | ||
| 1394 | |||
| 1395 | return 0; | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | static void uprobe_warn(struct task_struct *t, const char *msg) | ||
| 1399 | { | ||
| 1400 | pr_warn("uprobe: %s:%d failed to %s\n", | ||
| 1401 | current->comm, current->pid, msg); | ||
| 1402 | } | ||
| 1403 | |||
| 1404 | static void dup_xol_work(struct callback_head *work) | ||
| 1405 | { | ||
| 1406 | kfree(work); | ||
| 1407 | |||
| 1408 | if (current->flags & PF_EXITING) | ||
| 1409 | return; | ||
| 1410 | |||
| 1411 | if (!__create_xol_area(current->utask->vaddr)) | ||
| 1412 | uprobe_warn(current, "dup xol area"); | ||
| 1413 | } | ||
| 1414 | |||
| 1415 | /* | ||
| 1416 | * Called in context of a new clone/fork from copy_process. | ||
| 1417 | */ | ||
| 1418 | void uprobe_copy_process(struct task_struct *t, unsigned long flags) | ||
| 1419 | { | ||
| 1420 | struct uprobe_task *utask = current->utask; | ||
| 1421 | struct mm_struct *mm = current->mm; | ||
| 1422 | struct callback_head *work; | ||
| 1423 | struct xol_area *area; | ||
| 1424 | |||
| 1425 | t->utask = NULL; | ||
| 1426 | |||
| 1427 | if (!utask || !utask->return_instances) | ||
| 1428 | return; | ||
| 1429 | |||
| 1430 | if (mm == t->mm && !(flags & CLONE_VFORK)) | ||
| 1431 | return; | ||
| 1432 | |||
| 1433 | if (dup_utask(t, utask)) | ||
| 1434 | return uprobe_warn(t, "dup ret instances"); | ||
| 1435 | |||
| 1436 | /* The task can fork() after dup_xol_work() fails */ | ||
| 1437 | area = mm->uprobes_state.xol_area; | ||
| 1438 | if (!area) | ||
| 1439 | return uprobe_warn(t, "dup xol area"); | ||
| 1440 | |||
| 1441 | if (mm == t->mm) | ||
| 1442 | return; | ||
| 1443 | |||
| 1444 | /* TODO: move it into the union in uprobe_task */ | ||
| 1445 | work = kmalloc(sizeof(*work), GFP_KERNEL); | ||
| 1446 | if (!work) | ||
| 1447 | return uprobe_warn(t, "dup xol area"); | ||
| 1448 | |||
| 1449 | t->utask->vaddr = area->vaddr; | ||
| 1450 | init_task_work(work, dup_xol_work); | ||
| 1451 | task_work_add(t, work, true); | ||
| 1452 | } | ||
| 1453 | |||
| 1370 | /* | 1454 | /* |
| 1371 | * Current area->vaddr notion assume the trampoline address is always | 1455 | * Current area->vaddr notion assume the trampoline address is always |
| 1372 | * equal area->vaddr. | 1456 | * equal area->vaddr. |
| @@ -1857,9 +1941,4 @@ static int __init init_uprobes(void) | |||
| 1857 | 1941 | ||
| 1858 | return register_die_notifier(&uprobe_exception_nb); | 1942 | return register_die_notifier(&uprobe_exception_nb); |
| 1859 | } | 1943 | } |
| 1860 | module_init(init_uprobes); | 1944 | __initcall(init_uprobes); |
| 1861 | |||
| 1862 | static void __exit exit_uprobes(void) | ||
| 1863 | { | ||
| 1864 | } | ||
| 1865 | module_exit(exit_uprobes); | ||
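Several things change in this file, but the copy_insn() rewrite has the loop worth staring at: it now walks the instruction bytes in chunks that never cross a page boundary, stops quietly at end of file, and reports -EIO only when nothing at all could be read. A userspace model of that loop, with pread() standing in for the page-cache access done by __copy_insn(); the chunk size and all names here are ours:

#include <errno.h>
#include <sys/types.h>
#include <unistd.h>

#define CHUNK_SIZE	4096UL		/* stand-in for PAGE_SIZE */

static int copy_insn_model(int fd, void *insn, int size, off_t offs)
{
	int err = -EIO;

	do {
		/* never let one step cross a chunk (page) boundary */
		int len = (int)(CHUNK_SIZE - (offs & (CHUNK_SIZE - 1)));
		ssize_t got;

		if (len > size)
			len = size;

		got = pread(fd, insn, len, offs);
		if (got < 0)
			return -errno;
		if (got == 0)
			break;		/* hit EOF: keep what we have */
		err = 0;		/* at least something was read */

		insn = (char *)insn + got;
		offs += got;
		size -= got;
	} while (size);

	return err;
}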
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..f6d11fc67f72 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -817,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
| 817 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 817 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| 818 | mm->pmd_huge_pte = NULL; | 818 | mm->pmd_huge_pte = NULL; |
| 819 | #endif | 819 | #endif |
| 820 | #ifdef CONFIG_NUMA_BALANCING | ||
| 821 | mm->first_nid = NUMA_PTE_SCAN_INIT; | ||
| 822 | #endif | ||
| 823 | if (!mm_init(mm, tsk)) | 820 | if (!mm_init(mm, tsk)) |
| 824 | goto fail_nomem; | 821 | goto fail_nomem; |
| 825 | 822 | ||
| @@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1313 | #endif | 1310 | #endif |
| 1314 | 1311 | ||
| 1315 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1312 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
| 1316 | sched_fork(p); | 1313 | sched_fork(clone_flags, p); |
| 1317 | 1314 | ||
| 1318 | retval = perf_event_init_task(p); | 1315 | retval = perf_event_init_task(p); |
| 1319 | if (retval) | 1316 | if (retval) |
| @@ -1373,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1373 | INIT_LIST_HEAD(&p->pi_state_list); | 1370 | INIT_LIST_HEAD(&p->pi_state_list); |
| 1374 | p->pi_state_cache = NULL; | 1371 | p->pi_state_cache = NULL; |
| 1375 | #endif | 1372 | #endif |
| 1376 | uprobe_copy_process(p); | ||
| 1377 | /* | 1373 | /* |
| 1378 | * sigaltstack should be cleared when sharing the same VM | 1374 | * sigaltstack should be cleared when sharing the same VM |
| 1379 | */ | 1375 | */ |
| @@ -1490,6 +1486,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1490 | perf_event_fork(p); | 1486 | perf_event_fork(p); |
| 1491 | 1487 | ||
| 1492 | trace_task_newtask(p, clone_flags); | 1488 | trace_task_newtask(p, clone_flags); |
| 1489 | uprobe_copy_process(p, clone_flags); | ||
| 1493 | 1490 | ||
| 1494 | return p; | 1491 | return p; |
| 1495 | 1492 | ||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 514bcfd855a8..3e59f951d42f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 956 | goto out_mput; | 956 | goto out_mput; |
| 957 | } | 957 | } |
| 958 | 958 | ||
| 959 | sched_setscheduler(t, SCHED_FIFO, ¶m); | 959 | sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); |
| 960 | 960 | ||
| 961 | /* | 961 | /* |
| 962 | * We keep the reference to the task struct even if | 962 | * We keep the reference to the task struct even if |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e16c45b9ee77..4e8e14c34e42 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
| 4224 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", | 4224 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", |
| 4225 | !rcu_lockdep_current_cpu_online() | 4225 | !rcu_lockdep_current_cpu_online() |
| 4226 | ? "RCU used illegally from offline CPU!\n" | 4226 | ? "RCU used illegally from offline CPU!\n" |
| 4227 | : rcu_is_cpu_idle() | 4227 | : !rcu_is_watching() |
| 4228 | ? "RCU used illegally from idle CPU!\n" | 4228 | ? "RCU used illegally from idle CPU!\n" |
| 4229 | : "", | 4229 | : "", |
| 4230 | rcu_scheduler_active, debug_locks); | 4230 | rcu_scheduler_active, debug_locks); |
| @@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
| 4247 | * So complain bitterly if someone does call rcu_read_lock(), | 4247 | * So complain bitterly if someone does call rcu_read_lock(), |
| 4248 | * rcu_read_lock_bh() and so on from extended quiescent states. | 4248 | * rcu_read_lock_bh() and so on from extended quiescent states. |
| 4249 | */ | 4249 | */ |
| 4250 | if (rcu_is_cpu_idle()) | 4250 | if (!rcu_is_watching()) |
| 4251 | printk("RCU used illegally from extended quiescent state!\n"); | 4251 | printk("RCU used illegally from extended quiescent state!\n"); |
| 4252 | 4252 | ||
| 4253 | lockdep_print_held_locks(curr); | 4253 | lockdep_print_held_locks(curr); |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 6d647aedffea..d24105b1b794 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -410,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, | |||
| 410 | static __always_inline int __sched | 410 | static __always_inline int __sched |
| 411 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | 411 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, |
| 412 | struct lockdep_map *nest_lock, unsigned long ip, | 412 | struct lockdep_map *nest_lock, unsigned long ip, |
| 413 | struct ww_acquire_ctx *ww_ctx) | 413 | struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) |
| 414 | { | 414 | { |
| 415 | struct task_struct *task = current; | 415 | struct task_struct *task = current; |
| 416 | struct mutex_waiter waiter; | 416 | struct mutex_waiter waiter; |
| @@ -450,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 450 | struct task_struct *owner; | 450 | struct task_struct *owner; |
| 451 | struct mspin_node node; | 451 | struct mspin_node node; |
| 452 | 452 | ||
| 453 | if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { | 453 | if (use_ww_ctx && ww_ctx->acquired > 0) { |
| 454 | struct ww_mutex *ww; | 454 | struct ww_mutex *ww; |
| 455 | 455 | ||
| 456 | ww = container_of(lock, struct ww_mutex, base); | 456 | ww = container_of(lock, struct ww_mutex, base); |
| @@ -480,7 +480,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 480 | if ((atomic_read(&lock->count) == 1) && | 480 | if ((atomic_read(&lock->count) == 1) && |
| 481 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { | 481 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { |
| 482 | lock_acquired(&lock->dep_map, ip); | 482 | lock_acquired(&lock->dep_map, ip); |
| 483 | if (!__builtin_constant_p(ww_ctx == NULL)) { | 483 | if (use_ww_ctx) { |
| 484 | struct ww_mutex *ww; | 484 | struct ww_mutex *ww; |
| 485 | ww = container_of(lock, struct ww_mutex, base); | 485 | ww = container_of(lock, struct ww_mutex, base); |
| 486 | 486 | ||
| @@ -551,7 +551,7 @@ slowpath: | |||
| 551 | goto err; | 551 | goto err; |
| 552 | } | 552 | } |
| 553 | 553 | ||
| 554 | if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { | 554 | if (use_ww_ctx && ww_ctx->acquired > 0) { |
| 555 | ret = __mutex_lock_check_stamp(lock, ww_ctx); | 555 | ret = __mutex_lock_check_stamp(lock, ww_ctx); |
| 556 | if (ret) | 556 | if (ret) |
| 557 | goto err; | 557 | goto err; |
| @@ -575,7 +575,7 @@ skip_wait: | |||
| 575 | lock_acquired(&lock->dep_map, ip); | 575 | lock_acquired(&lock->dep_map, ip); |
| 576 | mutex_set_owner(lock); | 576 | mutex_set_owner(lock); |
| 577 | 577 | ||
| 578 | if (!__builtin_constant_p(ww_ctx == NULL)) { | 578 | if (use_ww_ctx) { |
| 579 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | 579 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); |
| 580 | struct mutex_waiter *cur; | 580 | struct mutex_waiter *cur; |
| 581 | 581 | ||
| @@ -615,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) | |||
| 615 | { | 615 | { |
| 616 | might_sleep(); | 616 | might_sleep(); |
| 617 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, | 617 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, |
| 618 | subclass, NULL, _RET_IP_, NULL); | 618 | subclass, NULL, _RET_IP_, NULL, 0); |
| 619 | } | 619 | } |
| 620 | 620 | ||
| 621 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 621 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
| @@ -625,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) | |||
| 625 | { | 625 | { |
| 626 | might_sleep(); | 626 | might_sleep(); |
| 627 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, | 627 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, |
| 628 | 0, nest, _RET_IP_, NULL); | 628 | 0, nest, _RET_IP_, NULL, 0); |
| 629 | } | 629 | } |
| 630 | 630 | ||
| 631 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); | 631 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); |
| @@ -635,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) | |||
| 635 | { | 635 | { |
| 636 | might_sleep(); | 636 | might_sleep(); |
| 637 | return __mutex_lock_common(lock, TASK_KILLABLE, | 637 | return __mutex_lock_common(lock, TASK_KILLABLE, |
| 638 | subclass, NULL, _RET_IP_, NULL); | 638 | subclass, NULL, _RET_IP_, NULL, 0); |
| 639 | } | 639 | } |
| 640 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); | 640 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); |
| 641 | 641 | ||
| @@ -644,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | |||
| 644 | { | 644 | { |
| 645 | might_sleep(); | 645 | might_sleep(); |
| 646 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, | 646 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, |
| 647 | subclass, NULL, _RET_IP_, NULL); | 647 | subclass, NULL, _RET_IP_, NULL, 0); |
| 648 | } | 648 | } |
| 649 | 649 | ||
| 650 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | 650 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); |
| @@ -682,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |||
| 682 | 682 | ||
| 683 | might_sleep(); | 683 | might_sleep(); |
| 684 | ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, | 684 | ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, |
| 685 | 0, &ctx->dep_map, _RET_IP_, ctx); | 685 | 0, &ctx->dep_map, _RET_IP_, ctx, 1); |
| 686 | if (!ret && ctx->acquired > 1) | 686 | if (!ret && ctx->acquired > 1) |
| 687 | return ww_mutex_deadlock_injection(lock, ctx); | 687 | return ww_mutex_deadlock_injection(lock, ctx); |
| 688 | 688 | ||
| @@ -697,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |||
| 697 | 697 | ||
| 698 | might_sleep(); | 698 | might_sleep(); |
| 699 | ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, | 699 | ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, |
| 700 | 0, &ctx->dep_map, _RET_IP_, ctx); | 700 | 0, &ctx->dep_map, _RET_IP_, ctx, 1); |
| 701 | 701 | ||
| 702 | if (!ret && ctx->acquired > 1) | 702 | if (!ret && ctx->acquired > 1) |
| 703 | return ww_mutex_deadlock_injection(lock, ctx); | 703 | return ww_mutex_deadlock_injection(lock, ctx); |
| @@ -809,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count) | |||
| 809 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 809 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 810 | 810 | ||
| 811 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, | 811 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, |
| 812 | NULL, _RET_IP_, NULL); | 812 | NULL, _RET_IP_, NULL, 0); |
| 813 | } | 813 | } |
| 814 | 814 | ||
| 815 | static noinline int __sched | 815 | static noinline int __sched |
| 816 | __mutex_lock_killable_slowpath(struct mutex *lock) | 816 | __mutex_lock_killable_slowpath(struct mutex *lock) |
| 817 | { | 817 | { |
| 818 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, | 818 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, |
| 819 | NULL, _RET_IP_, NULL); | 819 | NULL, _RET_IP_, NULL, 0); |
| 820 | } | 820 | } |
| 821 | 821 | ||
| 822 | static noinline int __sched | 822 | static noinline int __sched |
| 823 | __mutex_lock_interruptible_slowpath(struct mutex *lock) | 823 | __mutex_lock_interruptible_slowpath(struct mutex *lock) |
| 824 | { | 824 | { |
| 825 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, | 825 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, |
| 826 | NULL, _RET_IP_, NULL); | 826 | NULL, _RET_IP_, NULL, 0); |
| 827 | } | 827 | } |
| 828 | 828 | ||
| 829 | static noinline int __sched | 829 | static noinline int __sched |
| 830 | __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | 830 | __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) |
| 831 | { | 831 | { |
| 832 | return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, | 832 | return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, |
| 833 | NULL, _RET_IP_, ctx); | 833 | NULL, _RET_IP_, ctx, 1); |
| 834 | } | 834 | } |
| 835 | 835 | ||
| 836 | static noinline int __sched | 836 | static noinline int __sched |
| @@ -838,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, | |||
| 838 | struct ww_acquire_ctx *ctx) | 838 | struct ww_acquire_ctx *ctx) |
| 839 | { | 839 | { |
| 840 | return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, | 840 | return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, |
| 841 | NULL, _RET_IP_, ctx); | 841 | NULL, _RET_IP_, ctx, 1); |
| 842 | } | 842 | } |
| 843 | 843 | ||
| 844 | #endif | 844 | #endif |
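Annotation: the mutex.c hunks above replace the __builtin_constant_p(ww_ctx == NULL) trick with an explicit const bool use_ww_ctx argument to the always-inlined common path, so each caller that passes a literal 0 or 1 gets a specialized copy with the dead branches removed. The sketch below illustrates that pattern with hypothetical lock_common()/acquire_ctx names; it is not the kernel implementation.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct acquire_ctx {
    int acquired;
};

/* The constant bool lets the compiler drop the untaken branch per call site. */
static inline __attribute__((always_inline))
int lock_common(int *lock, struct acquire_ctx *ctx, const bool use_ctx)
{
    if (use_ctx && ctx->acquired > 0)   /* folded away when use_ctx == 0 */
        printf("already holds %d locks in this context\n", ctx->acquired);
    *lock = 1;                          /* pretend we took the lock */
    return 0;
}

static int lock_plain(int *lock)
{
    return lock_common(lock, NULL, 0);  /* context branch compiles out */
}

static int lock_with_ctx(int *lock, struct acquire_ctx *ctx)
{
    return lock_common(lock, ctx, 1);
}

int main(void)
{
    int lock = 0;
    struct acquire_ctx ctx = { .acquired = 2 };

    lock_plain(&lock);
    lock_with_ctx(&lock, &ctx);
    return 0;
}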
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c9c759d5a15c..0121dab83f43 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -846,7 +846,7 @@ static int software_resume(void) | |||
| 846 | goto Finish; | 846 | goto Finish; |
| 847 | } | 847 | } |
| 848 | 848 | ||
| 849 | late_initcall(software_resume); | 849 | late_initcall_sync(software_resume); |
| 850 | 850 | ||
| 851 | 851 | ||
| 852 | static const char * const hibernation_modes[] = { | 852 | static const char * const hibernation_modes[] = { |
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile new file mode 100644 index 000000000000..01e9ec37a3e3 --- /dev/null +++ b/kernel/rcu/Makefile | |||
| @@ -0,0 +1,6 @@ | |||
| 1 | obj-y += update.o srcu.o | ||
| 2 | obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o | ||
| 3 | obj-$(CONFIG_TREE_RCU) += tree.o | ||
| 4 | obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o | ||
| 5 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o | ||
| 6 | obj-$(CONFIG_TINY_RCU) += tiny.o | ||
diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h index 77131966c4ad..7859a0a3951e 100644 --- a/kernel/rcu.h +++ b/kernel/rcu/rcu.h | |||
| @@ -122,4 +122,11 @@ int rcu_jiffies_till_stall_check(void); | |||
| 122 | 122 | ||
| 123 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | 123 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ |
| 124 | 124 | ||
| 125 | /* | ||
| 126 | * Strings used in tracepoints need to be exported via the | ||
| 127 | * tracing system such that tools like perf and trace-cmd can | ||
| 128 | * translate the string address pointers to actual text. | ||
| 129 | */ | ||
| 130 | #define TPS(x) tracepoint_string(x) | ||
| 131 | |||
| 125 | #endif /* __LINUX_RCU_H */ | 132 | #endif /* __LINUX_RCU_H */ |
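Annotation: the rcu.h hunk above centralizes the TPS() wrapper so every RCU file marks its tracepoint strings the same way; tracepoint_string() keeps each literal where user-space tools can map the recorded pointer back to text. The userspace sketch below mimics that idea with a dedicated ELF section; TRACE_STR() and emit_event() are made-up names, not the kernel mechanism.

#include <stdio.h>

/*
 * Keep each literal in a named section so an offline tool could translate the
 * pointer stored in a binary trace back into readable text.  GCC statement
 * expression and section attribute; illustration only.
 */
#define TRACE_STR(str)                                                  \
    ({                                                                  \
        static const char _s[]                                          \
            __attribute__((section("trace_strings"), used)) = str;      \
        _s;                                                             \
    })

static void emit_event(const char *tag, long oldval, long newval)
{
    /* a real tracer would record only the pointer value of 'tag' */
    printf("%p: %ld -> %ld\n", (const void *)tag, oldval, newval);
}

int main(void)
{
    emit_event(TRACE_STR("Start"), 1, 0);
    emit_event(TRACE_STR("End"), 0, 1);
    return 0;
}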
diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c index 01d5ccb8bfe3..01d5ccb8bfe3 100644 --- a/kernel/srcu.c +++ b/kernel/rcu/srcu.c | |||
diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c index 9ed6075dc562..0c9a934cfec1 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/time.h> | 35 | #include <linux/time.h> |
| 36 | #include <linux/cpu.h> | 36 | #include <linux/cpu.h> |
| 37 | #include <linux/prefetch.h> | 37 | #include <linux/prefetch.h> |
| 38 | #include <linux/ftrace_event.h> | ||
| 38 | 39 | ||
| 39 | #ifdef CONFIG_RCU_TRACE | 40 | #ifdef CONFIG_RCU_TRACE |
| 40 | #include <trace/events/rcu.h> | 41 | #include <trace/events/rcu.h> |
| @@ -42,7 +43,7 @@ | |||
| 42 | 43 | ||
| 43 | #include "rcu.h" | 44 | #include "rcu.h" |
| 44 | 45 | ||
| 45 | /* Forward declarations for rcutiny_plugin.h. */ | 46 | /* Forward declarations for tiny_plugin.h. */ |
| 46 | struct rcu_ctrlblk; | 47 | struct rcu_ctrlblk; |
| 47 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 48 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); |
| 48 | static void rcu_process_callbacks(struct softirq_action *unused); | 49 | static void rcu_process_callbacks(struct softirq_action *unused); |
| @@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head, | |||
| 52 | 53 | ||
| 53 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 54 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
| 54 | 55 | ||
| 55 | #include "rcutiny_plugin.h" | 56 | #include "tiny_plugin.h" |
| 56 | 57 | ||
| 57 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ | 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ |
| 58 | static void rcu_idle_enter_common(long long newval) | 59 | static void rcu_idle_enter_common(long long newval) |
| 59 | { | 60 | { |
| 60 | if (newval) { | 61 | if (newval) { |
| 61 | RCU_TRACE(trace_rcu_dyntick("--=", | 62 | RCU_TRACE(trace_rcu_dyntick(TPS("--="), |
| 62 | rcu_dynticks_nesting, newval)); | 63 | rcu_dynticks_nesting, newval)); |
| 63 | rcu_dynticks_nesting = newval; | 64 | rcu_dynticks_nesting = newval; |
| 64 | return; | 65 | return; |
| 65 | } | 66 | } |
| 66 | RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); | 67 | RCU_TRACE(trace_rcu_dyntick(TPS("Start"), |
| 68 | rcu_dynticks_nesting, newval)); | ||
| 67 | if (!is_idle_task(current)) { | 69 | if (!is_idle_task(current)) { |
| 68 | struct task_struct *idle = idle_task(smp_processor_id()); | 70 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); |
| 69 | 71 | ||
| 70 | RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", | 72 | RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), |
| 71 | rcu_dynticks_nesting, newval)); | 73 | rcu_dynticks_nesting, newval)); |
| 72 | ftrace_dump(DUMP_ALL); | 74 | ftrace_dump(DUMP_ALL); |
| 73 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 75 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
| @@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit); | |||
| 120 | static void rcu_idle_exit_common(long long oldval) | 122 | static void rcu_idle_exit_common(long long oldval) |
| 121 | { | 123 | { |
| 122 | if (oldval) { | 124 | if (oldval) { |
| 123 | RCU_TRACE(trace_rcu_dyntick("++=", | 125 | RCU_TRACE(trace_rcu_dyntick(TPS("++="), |
| 124 | oldval, rcu_dynticks_nesting)); | 126 | oldval, rcu_dynticks_nesting)); |
| 125 | return; | 127 | return; |
| 126 | } | 128 | } |
| 127 | RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); | 129 | RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); |
| 128 | if (!is_idle_task(current)) { | 130 | if (!is_idle_task(current)) { |
| 129 | struct task_struct *idle = idle_task(smp_processor_id()); | 131 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); |
| 130 | 132 | ||
| 131 | RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", | 133 | RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), |
| 132 | oldval, rcu_dynticks_nesting)); | 134 | oldval, rcu_dynticks_nesting)); |
| 133 | ftrace_dump(DUMP_ALL); | 135 | ftrace_dump(DUMP_ALL); |
| 134 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 136 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
| @@ -174,18 +176,18 @@ void rcu_irq_enter(void) | |||
| 174 | } | 176 | } |
| 175 | EXPORT_SYMBOL_GPL(rcu_irq_enter); | 177 | EXPORT_SYMBOL_GPL(rcu_irq_enter); |
| 176 | 178 | ||
| 177 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 179 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) |
| 178 | 180 | ||
| 179 | /* | 181 | /* |
| 180 | * Test whether RCU thinks that the current CPU is idle. | 182 | * Test whether RCU thinks that the current CPU is idle. |
| 181 | */ | 183 | */ |
| 182 | int rcu_is_cpu_idle(void) | 184 | bool __rcu_is_watching(void) |
| 183 | { | 185 | { |
| 184 | return !rcu_dynticks_nesting; | 186 | return rcu_dynticks_nesting; |
| 185 | } | 187 | } |
| 186 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 188 | EXPORT_SYMBOL(__rcu_is_watching); |
| 187 | 189 | ||
| 188 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 190 | #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ |
| 189 | 191 | ||
| 190 | /* | 192 | /* |
| 191 | * Test whether the current CPU was interrupted from idle. Nested | 193 | * Test whether the current CPU was interrupted from idle. Nested |
| @@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 273 | if (&rcp->rcucblist == rcp->donetail) { | 275 | if (&rcp->rcucblist == rcp->donetail) { |
| 274 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); | 276 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); |
| 275 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, | 277 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, |
| 276 | ACCESS_ONCE(rcp->rcucblist), | 278 | !!ACCESS_ONCE(rcp->rcucblist), |
| 277 | need_resched(), | 279 | need_resched(), |
| 278 | is_idle_task(current), | 280 | is_idle_task(current), |
| 279 | false)); | 281 | false)); |
| @@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 304 | RCU_TRACE(cb_count++); | 306 | RCU_TRACE(cb_count++); |
| 305 | } | 307 | } |
| 306 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 308 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); |
| 307 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), | 309 | RCU_TRACE(trace_rcu_batch_end(rcp->name, |
| 310 | cb_count, 0, need_resched(), | ||
| 308 | is_idle_task(current), | 311 | is_idle_task(current), |
| 309 | false)); | 312 | false)); |
| 310 | } | 313 | } |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h index 280d06cae352..280d06cae352 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c index be63101c6175..3929cd451511 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcu/torture.c | |||
| @@ -52,6 +52,12 @@ | |||
| 52 | MODULE_LICENSE("GPL"); | 52 | MODULE_LICENSE("GPL"); |
| 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); | 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); |
| 54 | 54 | ||
| 55 | MODULE_ALIAS("rcutorture"); | ||
| 56 | #ifdef MODULE_PARAM_PREFIX | ||
| 57 | #undef MODULE_PARAM_PREFIX | ||
| 58 | #endif | ||
| 59 | #define MODULE_PARAM_PREFIX "rcutorture." | ||
| 60 | |||
| 55 | static int fqs_duration; | 61 | static int fqs_duration; |
| 56 | module_param(fqs_duration, int, 0444); | 62 | module_param(fqs_duration, int, 0444); |
| 57 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); | 63 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); |
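Annotation: the torture.c hunk above pins the parameter namespace and modprobe name across the file move, so existing "rcutorture.<param>" usage keeps working even though the object is now built as torture.o. The stand-alone module below is a hypothetical sketch of the same idiom, not the actual rcutorture source.

/* Hypothetical demo module showing the MODULE_PARAM_PREFIX/MODULE_ALIAS idiom. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

MODULE_LICENSE("GPL");
MODULE_ALIAS("rcutorture");

#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutorture."

static int demo_interval = 3;
module_param(demo_interval, int, 0444);
MODULE_PARM_DESC(demo_interval, "Illustrative interval parameter (seconds)");

static int __init demo_init(void)
{
    pr_info("prefix demo: demo_interval=%d\n", demo_interval);
    return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);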
diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c index 32618b3fe4e6..4c06ddfea7cd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcu/tree.c | |||
| @@ -41,6 +41,7 @@ | |||
| 41 | #include <linux/export.h> | 41 | #include <linux/export.h> |
| 42 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
| 43 | #include <linux/moduleparam.h> | 43 | #include <linux/moduleparam.h> |
| 44 | #include <linux/module.h> | ||
| 44 | #include <linux/percpu.h> | 45 | #include <linux/percpu.h> |
| 45 | #include <linux/notifier.h> | 46 | #include <linux/notifier.h> |
| 46 | #include <linux/cpu.h> | 47 | #include <linux/cpu.h> |
| @@ -56,17 +57,16 @@ | |||
| 56 | #include <linux/ftrace_event.h> | 57 | #include <linux/ftrace_event.h> |
| 57 | #include <linux/suspend.h> | 58 | #include <linux/suspend.h> |
| 58 | 59 | ||
| 59 | #include "rcutree.h" | 60 | #include "tree.h" |
| 60 | #include <trace/events/rcu.h> | 61 | #include <trace/events/rcu.h> |
| 61 | 62 | ||
| 62 | #include "rcu.h" | 63 | #include "rcu.h" |
| 63 | 64 | ||
| 64 | /* | 65 | MODULE_ALIAS("rcutree"); |
| 65 | * Strings used in tracepoints need to be exported via the | 66 | #ifdef MODULE_PARAM_PREFIX |
| 66 | * tracing system such that tools like perf and trace-cmd can | 67 | #undef MODULE_PARAM_PREFIX |
| 67 | * translate the string address pointers to actual text. | 68 | #endif |
| 68 | */ | 69 | #define MODULE_PARAM_PREFIX "rcutree." |
| 69 | #define TPS(x) tracepoint_string(x) | ||
| 70 | 70 | ||
| 71 | /* Data structures. */ | 71 | /* Data structures. */ |
| 72 | 72 | ||
| @@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu) | |||
| 222 | } | 222 | } |
| 223 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 223 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
| 224 | 224 | ||
| 225 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 225 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
| 226 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 226 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
| 227 | .dynticks = ATOMIC_INIT(1), | 227 | .dynticks = ATOMIC_INIT(1), |
| 228 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 228 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
| @@ -371,7 +371,8 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | |||
| 371 | { | 371 | { |
| 372 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); | 372 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); |
| 373 | if (!user && !is_idle_task(current)) { | 373 | if (!user && !is_idle_task(current)) { |
| 374 | struct task_struct *idle = idle_task(smp_processor_id()); | 374 | struct task_struct *idle __maybe_unused = |
| 375 | idle_task(smp_processor_id()); | ||
| 375 | 376 | ||
| 376 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); | 377 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); |
| 377 | ftrace_dump(DUMP_ORIG); | 378 | ftrace_dump(DUMP_ORIG); |
| @@ -407,7 +408,7 @@ static void rcu_eqs_enter(bool user) | |||
| 407 | long long oldval; | 408 | long long oldval; |
| 408 | struct rcu_dynticks *rdtp; | 409 | struct rcu_dynticks *rdtp; |
| 409 | 410 | ||
| 410 | rdtp = &__get_cpu_var(rcu_dynticks); | 411 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 411 | oldval = rdtp->dynticks_nesting; | 412 | oldval = rdtp->dynticks_nesting; |
| 412 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | 413 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); |
| 413 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) | 414 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) |
| @@ -435,7 +436,7 @@ void rcu_idle_enter(void) | |||
| 435 | 436 | ||
| 436 | local_irq_save(flags); | 437 | local_irq_save(flags); |
| 437 | rcu_eqs_enter(false); | 438 | rcu_eqs_enter(false); |
| 438 | rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); | 439 | rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); |
| 439 | local_irq_restore(flags); | 440 | local_irq_restore(flags); |
| 440 | } | 441 | } |
| 441 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 442 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
| @@ -478,7 +479,7 @@ void rcu_irq_exit(void) | |||
| 478 | struct rcu_dynticks *rdtp; | 479 | struct rcu_dynticks *rdtp; |
| 479 | 480 | ||
| 480 | local_irq_save(flags); | 481 | local_irq_save(flags); |
| 481 | rdtp = &__get_cpu_var(rcu_dynticks); | 482 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 482 | oldval = rdtp->dynticks_nesting; | 483 | oldval = rdtp->dynticks_nesting; |
| 483 | rdtp->dynticks_nesting--; | 484 | rdtp->dynticks_nesting--; |
| 484 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); | 485 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); |
| @@ -508,7 +509,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | |||
| 508 | rcu_cleanup_after_idle(smp_processor_id()); | 509 | rcu_cleanup_after_idle(smp_processor_id()); |
| 509 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); | 510 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); |
| 510 | if (!user && !is_idle_task(current)) { | 511 | if (!user && !is_idle_task(current)) { |
| 511 | struct task_struct *idle = idle_task(smp_processor_id()); | 512 | struct task_struct *idle __maybe_unused = |
| 513 | idle_task(smp_processor_id()); | ||
| 512 | 514 | ||
| 513 | trace_rcu_dyntick(TPS("Error on exit: not idle task"), | 515 | trace_rcu_dyntick(TPS("Error on exit: not idle task"), |
| 514 | oldval, rdtp->dynticks_nesting); | 516 | oldval, rdtp->dynticks_nesting); |
| @@ -528,7 +530,7 @@ static void rcu_eqs_exit(bool user) | |||
| 528 | struct rcu_dynticks *rdtp; | 530 | struct rcu_dynticks *rdtp; |
| 529 | long long oldval; | 531 | long long oldval; |
| 530 | 532 | ||
| 531 | rdtp = &__get_cpu_var(rcu_dynticks); | 533 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 532 | oldval = rdtp->dynticks_nesting; | 534 | oldval = rdtp->dynticks_nesting; |
| 533 | WARN_ON_ONCE(oldval < 0); | 535 | WARN_ON_ONCE(oldval < 0); |
| 534 | if (oldval & DYNTICK_TASK_NEST_MASK) | 536 | if (oldval & DYNTICK_TASK_NEST_MASK) |
| @@ -555,7 +557,7 @@ void rcu_idle_exit(void) | |||
| 555 | 557 | ||
| 556 | local_irq_save(flags); | 558 | local_irq_save(flags); |
| 557 | rcu_eqs_exit(false); | 559 | rcu_eqs_exit(false); |
| 558 | rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); | 560 | rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); |
| 559 | local_irq_restore(flags); | 561 | local_irq_restore(flags); |
| 560 | } | 562 | } |
| 561 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 563 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
| @@ -599,7 +601,7 @@ void rcu_irq_enter(void) | |||
| 599 | long long oldval; | 601 | long long oldval; |
| 600 | 602 | ||
| 601 | local_irq_save(flags); | 603 | local_irq_save(flags); |
| 602 | rdtp = &__get_cpu_var(rcu_dynticks); | 604 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 603 | oldval = rdtp->dynticks_nesting; | 605 | oldval = rdtp->dynticks_nesting; |
| 604 | rdtp->dynticks_nesting++; | 606 | rdtp->dynticks_nesting++; |
| 605 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); | 607 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); |
| @@ -620,7 +622,7 @@ void rcu_irq_enter(void) | |||
| 620 | */ | 622 | */ |
| 621 | void rcu_nmi_enter(void) | 623 | void rcu_nmi_enter(void) |
| 622 | { | 624 | { |
| 623 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 625 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 624 | 626 | ||
| 625 | if (rdtp->dynticks_nmi_nesting == 0 && | 627 | if (rdtp->dynticks_nmi_nesting == 0 && |
| 626 | (atomic_read(&rdtp->dynticks) & 0x1)) | 628 | (atomic_read(&rdtp->dynticks) & 0x1)) |
| @@ -642,7 +644,7 @@ void rcu_nmi_enter(void) | |||
| 642 | */ | 644 | */ |
| 643 | void rcu_nmi_exit(void) | 645 | void rcu_nmi_exit(void) |
| 644 | { | 646 | { |
| 645 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 647 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 646 | 648 | ||
| 647 | if (rdtp->dynticks_nmi_nesting == 0 || | 649 | if (rdtp->dynticks_nmi_nesting == 0 || |
| 648 | --rdtp->dynticks_nmi_nesting != 0) | 650 | --rdtp->dynticks_nmi_nesting != 0) |
| @@ -655,21 +657,34 @@ void rcu_nmi_exit(void) | |||
| 655 | } | 657 | } |
| 656 | 658 | ||
| 657 | /** | 659 | /** |
| 658 | * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle | 660 | * __rcu_is_watching - are RCU read-side critical sections safe? |
| 661 | * | ||
| 662 | * Return true if RCU is watching the running CPU, which means that | ||
| 663 | * this CPU can safely enter RCU read-side critical sections. Unlike | ||
| 664 | * rcu_is_watching(), the caller of __rcu_is_watching() must have at | ||
| 665 | * least disabled preemption. | ||
| 666 | */ | ||
| 667 | bool __rcu_is_watching(void) | ||
| 668 | { | ||
| 669 | return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; | ||
| 670 | } | ||
| 671 | |||
| 672 | /** | ||
| 673 | * rcu_is_watching - see if RCU thinks that the current CPU is idle | ||
| 659 | * | 674 | * |
| 660 | * If the current CPU is in its idle loop and is neither in an interrupt | 675 | * If the current CPU is in its idle loop and is neither in an interrupt |
| 661 | * or NMI handler, return true. | 676 | * or NMI handler, return true. |
| 662 | */ | 677 | */ |
| 663 | int rcu_is_cpu_idle(void) | 678 | bool rcu_is_watching(void) |
| 664 | { | 679 | { |
| 665 | int ret; | 680 | int ret; |
| 666 | 681 | ||
| 667 | preempt_disable(); | 682 | preempt_disable(); |
| 668 | ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; | 683 | ret = __rcu_is_watching(); |
| 669 | preempt_enable(); | 684 | preempt_enable(); |
| 670 | return ret; | 685 | return ret; |
| 671 | } | 686 | } |
| 672 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 687 | EXPORT_SYMBOL_GPL(rcu_is_watching); |
| 673 | 688 | ||
| 674 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) | 689 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
| 675 | 690 | ||
| @@ -703,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void) | |||
| 703 | if (in_nmi()) | 718 | if (in_nmi()) |
| 704 | return 1; | 719 | return 1; |
| 705 | preempt_disable(); | 720 | preempt_disable(); |
| 706 | rdp = &__get_cpu_var(rcu_sched_data); | 721 | rdp = this_cpu_ptr(&rcu_sched_data); |
| 707 | rnp = rdp->mynode; | 722 | rnp = rdp->mynode; |
| 708 | ret = (rdp->grpmask & rnp->qsmaskinit) || | 723 | ret = (rdp->grpmask & rnp->qsmaskinit) || |
| 709 | !rcu_scheduler_fully_active; | 724 | !rcu_scheduler_fully_active; |
| @@ -723,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); | |||
| 723 | */ | 738 | */ |
| 724 | static int rcu_is_cpu_rrupt_from_idle(void) | 739 | static int rcu_is_cpu_rrupt_from_idle(void) |
| 725 | { | 740 | { |
| 726 | return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; | 741 | return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; |
| 727 | } | 742 | } |
| 728 | 743 | ||
| 729 | /* | 744 | /* |
| @@ -802,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
| 802 | 817 | ||
| 803 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 818 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
| 804 | { | 819 | { |
| 805 | rsp->gp_start = jiffies; | 820 | unsigned long j = ACCESS_ONCE(jiffies); |
| 806 | rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | 821 | |
| 822 | rsp->gp_start = j; | ||
| 823 | smp_wmb(); /* Record start time before stall time. */ | ||
| 824 | rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); | ||
| 807 | } | 825 | } |
| 808 | 826 | ||
| 809 | /* | 827 | /* |
| @@ -898,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 898 | force_quiescent_state(rsp); /* Kick them all. */ | 916 | force_quiescent_state(rsp); /* Kick them all. */ |
| 899 | } | 917 | } |
| 900 | 918 | ||
| 919 | /* | ||
| 920 | * This function really isn't for public consumption, but RCU is special in | ||
| 921 | * that context switches can allow the state machine to make progress. | ||
| 922 | */ | ||
| 923 | extern void resched_cpu(int cpu); | ||
| 924 | |||
| 901 | static void print_cpu_stall(struct rcu_state *rsp) | 925 | static void print_cpu_stall(struct rcu_state *rsp) |
| 902 | { | 926 | { |
| 903 | int cpu; | 927 | int cpu; |
| @@ -927,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 927 | 3 * rcu_jiffies_till_stall_check() + 3; | 951 | 3 * rcu_jiffies_till_stall_check() + 3; |
| 928 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 952 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 929 | 953 | ||
| 930 | set_need_resched(); /* kick ourselves to get things going. */ | 954 | /* |
| 955 | * Attempt to revive the RCU machinery by forcing a context switch. | ||
| 956 | * | ||
| 957 | * A context switch would normally allow the RCU state machine to make | ||
| 958 | * progress and it could be we're stuck in kernel space without context | ||
| 959 | * switches for an entirely unreasonable amount of time. | ||
| 960 | */ | ||
| 961 | resched_cpu(smp_processor_id()); | ||
| 931 | } | 962 | } |
| 932 | 963 | ||
| 933 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | 964 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) |
| 934 | { | 965 | { |
| 966 | unsigned long completed; | ||
| 967 | unsigned long gpnum; | ||
| 968 | unsigned long gps; | ||
| 935 | unsigned long j; | 969 | unsigned long j; |
| 936 | unsigned long js; | 970 | unsigned long js; |
| 937 | struct rcu_node *rnp; | 971 | struct rcu_node *rnp; |
| 938 | 972 | ||
| 939 | if (rcu_cpu_stall_suppress) | 973 | if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) |
| 940 | return; | 974 | return; |
| 941 | j = ACCESS_ONCE(jiffies); | 975 | j = ACCESS_ONCE(jiffies); |
| 976 | |||
| 977 | /* | ||
| 978 | * Lots of memory barriers to reject false positives. | ||
| 979 | * | ||
| 980 | * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, | ||
| 981 | * then rsp->gp_start, and finally rsp->completed. These values | ||
| 982 | * are updated in the opposite order with memory barriers (or | ||
| 983 | * equivalent) during grace-period initialization and cleanup. | ||
| 984 | * Now, a false positive can occur if we get a new value of | ||
| 985 | * rsp->gp_start and an old value of rsp->jiffies_stall. But given | ||
| 986 | * the memory barriers, the only way that this can happen is if one | ||
| 987 | * grace period ends and another starts between these two fetches. | ||
| 988 | * Detect this by comparing rsp->completed with the previous fetch | ||
| 989 | * from rsp->gpnum. | ||
| 990 | * | ||
| 991 | * Given this check, comparisons of jiffies, rsp->jiffies_stall, | ||
| 992 | * and rsp->gp_start suffice to forestall false positives. | ||
| 993 | */ | ||
| 994 | gpnum = ACCESS_ONCE(rsp->gpnum); | ||
| 995 | smp_rmb(); /* Pick up ->gpnum first... */ | ||
| 942 | js = ACCESS_ONCE(rsp->jiffies_stall); | 996 | js = ACCESS_ONCE(rsp->jiffies_stall); |
| 997 | smp_rmb(); /* ...then ->jiffies_stall before the rest... */ | ||
| 998 | gps = ACCESS_ONCE(rsp->gp_start); | ||
| 999 | smp_rmb(); /* ...and finally ->gp_start before ->completed. */ | ||
| 1000 | completed = ACCESS_ONCE(rsp->completed); | ||
| 1001 | if (ULONG_CMP_GE(completed, gpnum) || | ||
| 1002 | ULONG_CMP_LT(j, js) || | ||
| 1003 | ULONG_CMP_GE(gps, js)) | ||
| 1004 | return; /* No stall or GP completed since entering function. */ | ||
| 943 | rnp = rdp->mynode; | 1005 | rnp = rdp->mynode; |
| 944 | if (rcu_gp_in_progress(rsp) && | 1006 | if (rcu_gp_in_progress(rsp) && |
| 945 | (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { | 1007 | (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { |
| 946 | 1008 | ||
| 947 | /* We haven't checked in, so go dump stack. */ | 1009 | /* We haven't checked in, so go dump stack. */ |
| 948 | print_cpu_stall(rsp); | 1010 | print_cpu_stall(rsp); |
| @@ -1297,7 +1359,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1297 | } | 1359 | } |
| 1298 | 1360 | ||
| 1299 | /* | 1361 | /* |
| 1300 | * Initialize a new grace period. | 1362 | * Initialize a new grace period. Return 0 if no grace period required. |
| 1301 | */ | 1363 | */ |
| 1302 | static int rcu_gp_init(struct rcu_state *rsp) | 1364 | static int rcu_gp_init(struct rcu_state *rsp) |
| 1303 | { | 1365 | { |
| @@ -1306,18 +1368,27 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1306 | 1368 | ||
| 1307 | rcu_bind_gp_kthread(); | 1369 | rcu_bind_gp_kthread(); |
| 1308 | raw_spin_lock_irq(&rnp->lock); | 1370 | raw_spin_lock_irq(&rnp->lock); |
| 1371 | if (rsp->gp_flags == 0) { | ||
| 1372 | /* Spurious wakeup, tell caller to go back to sleep. */ | ||
| 1373 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1374 | return 0; | ||
| 1375 | } | ||
| 1309 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ | 1376 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ |
| 1310 | 1377 | ||
| 1311 | if (rcu_gp_in_progress(rsp)) { | 1378 | if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { |
| 1312 | /* Grace period already in progress, don't start another. */ | 1379 | /* |
| 1380 | * Grace period already in progress, don't start another. | ||
| 1381 | * Not supposed to be able to happen. | ||
| 1382 | */ | ||
| 1313 | raw_spin_unlock_irq(&rnp->lock); | 1383 | raw_spin_unlock_irq(&rnp->lock); |
| 1314 | return 0; | 1384 | return 0; |
| 1315 | } | 1385 | } |
| 1316 | 1386 | ||
| 1317 | /* Advance to a new grace period and initialize state. */ | 1387 | /* Advance to a new grace period and initialize state. */ |
| 1388 | record_gp_stall_check_time(rsp); | ||
| 1389 | smp_wmb(); /* Record GP times before starting GP. */ | ||
| 1318 | rsp->gpnum++; | 1390 | rsp->gpnum++; |
| 1319 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); | 1391 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); |
| 1320 | record_gp_stall_check_time(rsp); | ||
| 1321 | raw_spin_unlock_irq(&rnp->lock); | 1392 | raw_spin_unlock_irq(&rnp->lock); |
| 1322 | 1393 | ||
| 1323 | /* Exclude any concurrent CPU-hotplug operations. */ | 1394 | /* Exclude any concurrent CPU-hotplug operations. */ |
| @@ -1366,7 +1437,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1366 | /* | 1437 | /* |
| 1367 | * Do one round of quiescent-state forcing. | 1438 | * Do one round of quiescent-state forcing. |
| 1368 | */ | 1439 | */ |
| 1369 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | 1440 | static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) |
| 1370 | { | 1441 | { |
| 1371 | int fqs_state = fqs_state_in; | 1442 | int fqs_state = fqs_state_in; |
| 1372 | bool isidle = false; | 1443 | bool isidle = false; |
| @@ -1451,8 +1522,12 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1451 | rsp->fqs_state = RCU_GP_IDLE; | 1522 | rsp->fqs_state = RCU_GP_IDLE; |
| 1452 | rdp = this_cpu_ptr(rsp->rda); | 1523 | rdp = this_cpu_ptr(rsp->rda); |
| 1453 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | 1524 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ |
| 1454 | if (cpu_needs_another_gp(rsp, rdp)) | 1525 | if (cpu_needs_another_gp(rsp, rdp)) { |
| 1455 | rsp->gp_flags = 1; | 1526 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
| 1527 | trace_rcu_grace_period(rsp->name, | ||
| 1528 | ACCESS_ONCE(rsp->gpnum), | ||
| 1529 | TPS("newreq")); | ||
| 1530 | } | ||
| 1456 | raw_spin_unlock_irq(&rnp->lock); | 1531 | raw_spin_unlock_irq(&rnp->lock); |
| 1457 | } | 1532 | } |
| 1458 | 1533 | ||
| @@ -1462,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1462 | static int __noreturn rcu_gp_kthread(void *arg) | 1537 | static int __noreturn rcu_gp_kthread(void *arg) |
| 1463 | { | 1538 | { |
| 1464 | int fqs_state; | 1539 | int fqs_state; |
| 1540 | int gf; | ||
| 1465 | unsigned long j; | 1541 | unsigned long j; |
| 1466 | int ret; | 1542 | int ret; |
| 1467 | struct rcu_state *rsp = arg; | 1543 | struct rcu_state *rsp = arg; |
| @@ -1471,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1471 | 1547 | ||
| 1472 | /* Handle grace-period start. */ | 1548 | /* Handle grace-period start. */ |
| 1473 | for (;;) { | 1549 | for (;;) { |
| 1550 | trace_rcu_grace_period(rsp->name, | ||
| 1551 | ACCESS_ONCE(rsp->gpnum), | ||
| 1552 | TPS("reqwait")); | ||
| 1474 | wait_event_interruptible(rsp->gp_wq, | 1553 | wait_event_interruptible(rsp->gp_wq, |
| 1475 | rsp->gp_flags & | 1554 | ACCESS_ONCE(rsp->gp_flags) & |
| 1476 | RCU_GP_FLAG_INIT); | 1555 | RCU_GP_FLAG_INIT); |
| 1477 | if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && | 1556 | if (rcu_gp_init(rsp)) |
| 1478 | rcu_gp_init(rsp)) | ||
| 1479 | break; | 1557 | break; |
| 1480 | cond_resched(); | 1558 | cond_resched(); |
| 1481 | flush_signals(current); | 1559 | flush_signals(current); |
| 1560 | trace_rcu_grace_period(rsp->name, | ||
| 1561 | ACCESS_ONCE(rsp->gpnum), | ||
| 1562 | TPS("reqwaitsig")); | ||
| 1482 | } | 1563 | } |
| 1483 | 1564 | ||
| 1484 | /* Handle quiescent-state forcing. */ | 1565 | /* Handle quiescent-state forcing. */ |
| @@ -1488,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1488 | j = HZ; | 1569 | j = HZ; |
| 1489 | jiffies_till_first_fqs = HZ; | 1570 | jiffies_till_first_fqs = HZ; |
| 1490 | } | 1571 | } |
| 1572 | ret = 0; | ||
| 1491 | for (;;) { | 1573 | for (;;) { |
| 1492 | rsp->jiffies_force_qs = jiffies + j; | 1574 | if (!ret) |
| 1575 | rsp->jiffies_force_qs = jiffies + j; | ||
| 1576 | trace_rcu_grace_period(rsp->name, | ||
| 1577 | ACCESS_ONCE(rsp->gpnum), | ||
| 1578 | TPS("fqswait")); | ||
| 1493 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | 1579 | ret = wait_event_interruptible_timeout(rsp->gp_wq, |
| 1494 | (rsp->gp_flags & RCU_GP_FLAG_FQS) || | 1580 | ((gf = ACCESS_ONCE(rsp->gp_flags)) & |
| 1581 | RCU_GP_FLAG_FQS) || | ||
| 1495 | (!ACCESS_ONCE(rnp->qsmask) && | 1582 | (!ACCESS_ONCE(rnp->qsmask) && |
| 1496 | !rcu_preempt_blocked_readers_cgp(rnp)), | 1583 | !rcu_preempt_blocked_readers_cgp(rnp)), |
| 1497 | j); | 1584 | j); |
| @@ -1500,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1500 | !rcu_preempt_blocked_readers_cgp(rnp)) | 1587 | !rcu_preempt_blocked_readers_cgp(rnp)) |
| 1501 | break; | 1588 | break; |
| 1502 | /* If time for quiescent-state forcing, do it. */ | 1589 | /* If time for quiescent-state forcing, do it. */ |
| 1503 | if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { | 1590 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || |
| 1591 | (gf & RCU_GP_FLAG_FQS)) { | ||
| 1592 | trace_rcu_grace_period(rsp->name, | ||
| 1593 | ACCESS_ONCE(rsp->gpnum), | ||
| 1594 | TPS("fqsstart")); | ||
| 1504 | fqs_state = rcu_gp_fqs(rsp, fqs_state); | 1595 | fqs_state = rcu_gp_fqs(rsp, fqs_state); |
| 1596 | trace_rcu_grace_period(rsp->name, | ||
| 1597 | ACCESS_ONCE(rsp->gpnum), | ||
| 1598 | TPS("fqsend")); | ||
| 1505 | cond_resched(); | 1599 | cond_resched(); |
| 1506 | } else { | 1600 | } else { |
| 1507 | /* Deal with stray signal. */ | 1601 | /* Deal with stray signal. */ |
| 1508 | cond_resched(); | 1602 | cond_resched(); |
| 1509 | flush_signals(current); | 1603 | flush_signals(current); |
| 1604 | trace_rcu_grace_period(rsp->name, | ||
| 1605 | ACCESS_ONCE(rsp->gpnum), | ||
| 1606 | TPS("fqswaitsig")); | ||
| 1510 | } | 1607 | } |
| 1511 | j = jiffies_till_next_fqs; | 1608 | j = jiffies_till_next_fqs; |
| 1512 | if (j > HZ) { | 1609 | if (j > HZ) { |
| @@ -1554,6 +1651,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1554 | return; | 1651 | return; |
| 1555 | } | 1652 | } |
| 1556 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1653 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
| 1654 | trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), | ||
| 1655 | TPS("newreq")); | ||
| 1557 | 1656 | ||
| 1558 | /* | 1657 | /* |
| 1559 | * We can't do wakeups while holding the rnp->lock, as that | 1658 | * We can't do wakeups while holding the rnp->lock, as that |
| @@ -2255,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
| 2255 | * If called from an extended quiescent state, invoke the RCU | 2354 | * If called from an extended quiescent state, invoke the RCU |
| 2256 | * core in order to force a re-evaluation of RCU's idleness. | 2355 | * core in order to force a re-evaluation of RCU's idleness. |
| 2257 | */ | 2356 | */ |
| 2258 | if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) | 2357 | if (!rcu_is_watching() && cpu_online(smp_processor_id())) |
| 2259 | invoke_rcu_core(); | 2358 | invoke_rcu_core(); |
| 2260 | 2359 | ||
| 2261 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | 2360 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ |
| @@ -2725,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | |||
| 2725 | 2824 | ||
| 2726 | for_each_rcu_flavor(rsp) { | 2825 | for_each_rcu_flavor(rsp) { |
| 2727 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2826 | rdp = per_cpu_ptr(rsp->rda, cpu); |
| 2728 | if (rdp->qlen != rdp->qlen_lazy) | 2827 | if (!rdp->nxtlist) |
| 2828 | continue; | ||
| 2829 | hc = true; | ||
| 2830 | if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { | ||
| 2729 | al = false; | 2831 | al = false; |
| 2730 | if (rdp->nxtlist) | 2832 | break; |
| 2731 | hc = true; | 2833 | } |
| 2732 | } | 2834 | } |
| 2733 | if (all_lazy) | 2835 | if (all_lazy) |
| 2734 | *all_lazy = al; | 2836 | *all_lazy = al; |
| @@ -3216,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 3216 | 3318 | ||
| 3217 | /* | 3319 | /* |
| 3218 | * Compute the rcu_node tree geometry from kernel parameters. This cannot | 3320 | * Compute the rcu_node tree geometry from kernel parameters. This cannot |
| 3219 | * replace the definitions in rcutree.h because those are needed to size | 3321 | * replace the definitions in tree.h because those are needed to size |
| 3220 | * the ->node array in the rcu_state structure. | 3322 | * the ->node array in the rcu_state structure. |
| 3221 | */ | 3323 | */ |
| 3222 | static void __init rcu_init_geometry(void) | 3324 | static void __init rcu_init_geometry(void) |
| @@ -3295,8 +3397,8 @@ void __init rcu_init(void) | |||
| 3295 | 3397 | ||
| 3296 | rcu_bootup_announce(); | 3398 | rcu_bootup_announce(); |
| 3297 | rcu_init_geometry(); | 3399 | rcu_init_geometry(); |
| 3298 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | ||
| 3299 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 3400 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
| 3401 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | ||
| 3300 | __rcu_init_preempt(); | 3402 | __rcu_init_preempt(); |
| 3301 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 3403 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
| 3302 | 3404 | ||
| @@ -3311,4 +3413,4 @@ void __init rcu_init(void) | |||
| 3311 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 3413 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
| 3312 | } | 3414 | } |
| 3313 | 3415 | ||
| 3314 | #include "rcutree_plugin.h" | 3416 | #include "tree_plugin.h" |
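Annotation: among the tree.c changes above, the stall-warning rework pairs writer-side smp_wmb() calls (gp_start recorded before the stall deadline, GP times recorded before ->gpnum advances) with reader-side smp_rmb() calls in check_cpu_stall(). The sketch below is a simplified userspace analogue of part of that ordering, using C11 fences and hypothetical names; the real code additionally cross-checks ->gpnum and ->completed to close the remaining windows.

#include <stdatomic.h>
#include <stdbool.h>

struct gp_times {
    _Atomic unsigned long gp_start;
    _Atomic unsigned long stall_deadline;
};

/* Writer: record the start before the deadline, with a store barrier between. */
static void writer_record_start(struct gp_times *t, unsigned long now,
                                unsigned long timeout)
{
    atomic_store_explicit(&t->gp_start, now, memory_order_relaxed);
    atomic_thread_fence(memory_order_release);      /* like smp_wmb() */
    atomic_store_explicit(&t->stall_deadline, now + timeout,
                          memory_order_relaxed);
}

/*
 * Reader: sample the deadline before the start, with a load barrier between,
 * and refuse to report a stall unless the sampled start is strictly older than
 * the sampled deadline -- the analogue of the ULONG_CMP_GE(gps, js) rejection
 * in the hunk above.
 */
static bool reader_sees_stall(struct gp_times *t, unsigned long now)
{
    unsigned long deadline, start;

    deadline = atomic_load_explicit(&t->stall_deadline, memory_order_relaxed);
    atomic_thread_fence(memory_order_acquire);      /* like smp_rmb() */
    start = atomic_load_explicit(&t->gp_start, memory_order_relaxed);

    return (long)(deadline - start) > 0 && (long)(now - deadline) >= 0;
}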
diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h index 5f97eab602cd..52be957c9fe2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcu/tree.h | |||
| @@ -104,6 +104,8 @@ struct rcu_dynticks { | |||
| 104 | /* idle-period nonlazy_posted snapshot. */ | 104 | /* idle-period nonlazy_posted snapshot. */ |
| 105 | unsigned long last_accelerate; | 105 | unsigned long last_accelerate; |
| 106 | /* Last jiffy CBs were accelerated. */ | 106 | /* Last jiffy CBs were accelerated. */ |
| 107 | unsigned long last_advance_all; | ||
| 108 | /* Last jiffy CBs were all advanced. */ | ||
| 107 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | 109 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ |
| 108 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 110 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
| 109 | }; | 111 | }; |
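Annotation: the tree.h hunk above adds ->last_advance_all, which rcu_try_advance_all_cbs() (in the tree_plugin.h hunks below) uses to run its callback-advancing pass at most once per jiffy. The fragment below sketches that throttle in stand-alone form; fake_jiffies and should_run() are made-up names.

#include <stdbool.h>

static unsigned long fake_jiffies;      /* stands in for the kernel's jiffies */

struct throttle {
    unsigned long last_run;
};

/* Bail out if the expensive pass already ran during the current tick. */
static bool should_run(struct throttle *t)
{
    unsigned long now = fake_jiffies;

    if (now == t->last_run)
        return false;
    t->last_run = now;
    return true;
}

int main(void)
{
    struct throttle adv = { .last_run = (unsigned long)-1 };
    bool first, second;

    first = should_run(&adv);           /* runs: tick changed */
    second = should_run(&adv);          /* skipped: same tick */
    fake_jiffies++;
    return !(first && !second && should_run(&adv));
}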
diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h index 130c97b027f2..3822ac0c4b27 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -28,7 +28,7 @@ | |||
| 28 | #include <linux/gfp.h> | 28 | #include <linux/gfp.h> |
| 29 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
| 30 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
| 31 | #include "time/tick-internal.h" | 31 | #include "../time/tick-internal.h" |
| 32 | 32 | ||
| 33 | #define RCU_KTHREAD_PRIO 1 | 33 | #define RCU_KTHREAD_PRIO 1 |
| 34 | 34 | ||
| @@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 96 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | 96 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ |
| 97 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | 97 | #ifdef CONFIG_RCU_NOCB_CPU_ALL |
| 98 | pr_info("\tOffload RCU callbacks from all CPUs\n"); | 98 | pr_info("\tOffload RCU callbacks from all CPUs\n"); |
| 99 | cpumask_setall(rcu_nocb_mask); | 99 | cpumask_copy(rcu_nocb_mask, cpu_possible_mask); |
| 100 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | 100 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ |
| 101 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | 101 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ |
| 102 | if (have_rcu_nocb_mask) { | 102 | if (have_rcu_nocb_mask) { |
| 103 | if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { | ||
| 104 | pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); | ||
| 105 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | ||
| 106 | rcu_nocb_mask); | ||
| 107 | } | ||
| 103 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | 108 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); |
| 104 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); | 109 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); |
| 105 | if (rcu_nocb_poll) | 110 | if (rcu_nocb_poll) |
| @@ -660,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
| 660 | 665 | ||
| 661 | static void rcu_preempt_do_callbacks(void) | 666 | static void rcu_preempt_do_callbacks(void) |
| 662 | { | 667 | { |
| 663 | rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); | 668 | rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); |
| 664 | } | 669 | } |
| 665 | 670 | ||
| 666 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 671 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| @@ -1128,7 +1133,7 @@ void exit_rcu(void) | |||
| 1128 | 1133 | ||
| 1129 | #ifdef CONFIG_RCU_BOOST | 1134 | #ifdef CONFIG_RCU_BOOST |
| 1130 | 1135 | ||
| 1131 | #include "rtmutex_common.h" | 1136 | #include "../rtmutex_common.h" |
| 1132 | 1137 | ||
| 1133 | #ifdef CONFIG_RCU_TRACE | 1138 | #ifdef CONFIG_RCU_TRACE |
| 1134 | 1139 | ||
| @@ -1332,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void) | |||
| 1332 | */ | 1337 | */ |
| 1333 | static bool rcu_is_callbacks_kthread(void) | 1338 | static bool rcu_is_callbacks_kthread(void) |
| 1334 | { | 1339 | { |
| 1335 | return __get_cpu_var(rcu_cpu_kthread_task) == current; | 1340 | return __this_cpu_read(rcu_cpu_kthread_task) == current; |
| 1336 | } | 1341 | } |
| 1337 | 1342 | ||
| 1338 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | 1343 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) |
| @@ -1382,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
| 1382 | 1387 | ||
| 1383 | static void rcu_kthread_do_work(void) | 1388 | static void rcu_kthread_do_work(void) |
| 1384 | { | 1389 | { |
| 1385 | rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); | 1390 | rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); |
| 1386 | rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | 1391 | rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); |
| 1387 | rcu_preempt_do_callbacks(); | 1392 | rcu_preempt_do_callbacks(); |
| 1388 | } | 1393 | } |
| 1389 | 1394 | ||
| @@ -1402,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu) | |||
| 1402 | 1407 | ||
| 1403 | static int rcu_cpu_kthread_should_run(unsigned int cpu) | 1408 | static int rcu_cpu_kthread_should_run(unsigned int cpu) |
| 1404 | { | 1409 | { |
| 1405 | return __get_cpu_var(rcu_cpu_has_work); | 1410 | return __this_cpu_read(rcu_cpu_has_work); |
| 1406 | } | 1411 | } |
| 1407 | 1412 | ||
| 1408 | /* | 1413 | /* |
| @@ -1412,8 +1417,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) | |||
| 1412 | */ | 1417 | */ |
| 1413 | static void rcu_cpu_kthread(unsigned int cpu) | 1418 | static void rcu_cpu_kthread(unsigned int cpu) |
| 1414 | { | 1419 | { |
| 1415 | unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); | 1420 | unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); |
| 1416 | char work, *workp = &__get_cpu_var(rcu_cpu_has_work); | 1421 | char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); |
| 1417 | int spincnt; | 1422 | int spincnt; |
| 1418 | 1423 | ||
| 1419 | for (spincnt = 0; spincnt < 10; spincnt++) { | 1424 | for (spincnt = 0; spincnt < 10; spincnt++) { |
| @@ -1630,17 +1635,23 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644); | |||
| 1630 | extern int tick_nohz_enabled; | 1635 | extern int tick_nohz_enabled; |
| 1631 | 1636 | ||
| 1632 | /* | 1637 | /* |
| 1633 | * Try to advance callbacks for all flavors of RCU on the current CPU. | 1638 | * Try to advance callbacks for all flavors of RCU on the current CPU, but |
| 1634 | * Afterwards, if there are any callbacks ready for immediate invocation, | 1639 | * only if it has been awhile since the last time we did so. Afterwards, |
| 1635 | * return true. | 1640 | * if there are any callbacks ready for immediate invocation, return true. |
| 1636 | */ | 1641 | */ |
| 1637 | static bool rcu_try_advance_all_cbs(void) | 1642 | static bool rcu_try_advance_all_cbs(void) |
| 1638 | { | 1643 | { |
| 1639 | bool cbs_ready = false; | 1644 | bool cbs_ready = false; |
| 1640 | struct rcu_data *rdp; | 1645 | struct rcu_data *rdp; |
| 1646 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 1641 | struct rcu_node *rnp; | 1647 | struct rcu_node *rnp; |
| 1642 | struct rcu_state *rsp; | 1648 | struct rcu_state *rsp; |
| 1643 | 1649 | ||
| 1650 | /* Exit early if we advanced recently. */ | ||
| 1651 | if (jiffies == rdtp->last_advance_all) | ||
| 1652 | return 0; | ||
| 1653 | rdtp->last_advance_all = jiffies; | ||
| 1654 | |||
| 1644 | for_each_rcu_flavor(rsp) { | 1655 | for_each_rcu_flavor(rsp) { |
| 1645 | rdp = this_cpu_ptr(rsp->rda); | 1656 | rdp = this_cpu_ptr(rsp->rda); |
| 1646 | rnp = rdp->mynode; | 1657 | rnp = rdp->mynode; |
| @@ -1739,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1739 | */ | 1750 | */ |
| 1740 | if (rdtp->all_lazy && | 1751 | if (rdtp->all_lazy && |
| 1741 | rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { | 1752 | rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { |
| 1753 | rdtp->all_lazy = false; | ||
| 1754 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
| 1742 | invoke_rcu_core(); | 1755 | invoke_rcu_core(); |
| 1743 | return; | 1756 | return; |
| 1744 | } | 1757 | } |
| @@ -1768,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1768 | */ | 1781 | */ |
| 1769 | static void rcu_cleanup_after_idle(int cpu) | 1782 | static void rcu_cleanup_after_idle(int cpu) |
| 1770 | { | 1783 | { |
| 1771 | struct rcu_data *rdp; | ||
| 1772 | struct rcu_state *rsp; | ||
| 1773 | 1784 | ||
| 1774 | if (rcu_is_nocb_cpu(cpu)) | 1785 | if (rcu_is_nocb_cpu(cpu)) |
| 1775 | return; | 1786 | return; |
| 1776 | rcu_try_advance_all_cbs(); | 1787 | if (rcu_try_advance_all_cbs()) |
| 1777 | for_each_rcu_flavor(rsp) { | 1788 | invoke_rcu_core(); |
| 1778 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 1779 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | ||
| 1780 | invoke_rcu_core(); | ||
| 1781 | } | ||
| 1782 | } | 1789 | } |
| 1783 | 1790 | ||
| 1784 | /* | 1791 | /* |
| @@ -2108,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
| 2108 | 2115 | ||
| 2109 | /* If we are not being polled and there is a kthread, awaken it ... */ | 2116 | /* If we are not being polled and there is a kthread, awaken it ... */ |
| 2110 | t = ACCESS_ONCE(rdp->nocb_kthread); | 2117 | t = ACCESS_ONCE(rdp->nocb_kthread); |
| 2111 | if (rcu_nocb_poll | !t) | 2118 | if (rcu_nocb_poll || !t) { |
| 2119 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2120 | TPS("WakeNotPoll")); | ||
| 2112 | return; | 2121 | return; |
| 2122 | } | ||
| 2113 | len = atomic_long_read(&rdp->nocb_q_count); | 2123 | len = atomic_long_read(&rdp->nocb_q_count); |
| 2114 | if (old_rhpp == &rdp->nocb_head) { | 2124 | if (old_rhpp == &rdp->nocb_head) { |
| 2115 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ | 2125 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ |
| 2116 | rdp->qlen_last_fqs_check = 0; | 2126 | rdp->qlen_last_fqs_check = 0; |
| 2127 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); | ||
| 2117 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | 2128 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { |
| 2118 | wake_up_process(t); /* ... or if many callbacks queued. */ | 2129 | wake_up_process(t); /* ... or if many callbacks queued. */ |
| 2119 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | 2130 | rdp->qlen_last_fqs_check = LONG_MAX / 2; |
| 2131 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); | ||
| 2132 | } else { | ||
| 2133 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); | ||
| 2120 | } | 2134 | } |
| 2121 | return; | 2135 | return; |
| 2122 | } | 2136 | } |
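
The reworked enqueue path now records every wakeup decision with a tracepoint: no wake while polling or without a kthread ("WakeNotPoll"), a wake because the queue was previously empty ("WakeEmpty"), a forced wake once the queue outgrows qhimark ("WakeOvf"), and the do-nothing case ("WakeNot"). Reduced to the bare decision ladder, with plain parameters standing in for the rcu_data fields, the policy looks roughly like this:

    #include <stdbool.h>

    enum wake_action { WAKE_NONE, WAKE_EMPTY, WAKE_OVERFLOW };

    static enum wake_action nocb_wake_policy(bool polling, bool have_kthread,
                                             bool queue_was_empty, long qlen,
                                             long qlen_last_check, long qhimark)
    {
            if (polling || !have_kthread)
                    return WAKE_NONE;       /* "WakeNotPoll" */
            if (queue_was_empty)
                    return WAKE_EMPTY;      /* "WakeEmpty": kthread may be sleeping */
            if (qlen > qlen_last_check + qhimark)
                    return WAKE_OVERFLOW;   /* "WakeOvf": queue growing too fast */
            return WAKE_NONE;               /* "WakeNot": kthread already has work */
    }
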
| @@ -2140,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
| 2140 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | 2154 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) |
| 2141 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, | 2155 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, |
| 2142 | (unsigned long)rhp->func, | 2156 | (unsigned long)rhp->func, |
| 2143 | rdp->qlen_lazy, rdp->qlen); | 2157 | -atomic_long_read(&rdp->nocb_q_count_lazy), |
| 2158 | -atomic_long_read(&rdp->nocb_q_count)); | ||
| 2144 | else | 2159 | else |
| 2145 | trace_rcu_callback(rdp->rsp->name, rhp, | 2160 | trace_rcu_callback(rdp->rsp->name, rhp, |
| 2146 | rdp->qlen_lazy, rdp->qlen); | 2161 | -atomic_long_read(&rdp->nocb_q_count_lazy), |
| 2162 | -atomic_long_read(&rdp->nocb_q_count)); | ||
| 2147 | return 1; | 2163 | return 1; |
| 2148 | } | 2164 | } |
| 2149 | 2165 | ||
| @@ -2221,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
| 2221 | static int rcu_nocb_kthread(void *arg) | 2237 | static int rcu_nocb_kthread(void *arg) |
| 2222 | { | 2238 | { |
| 2223 | int c, cl; | 2239 | int c, cl; |
| 2240 | bool firsttime = 1; | ||
| 2224 | struct rcu_head *list; | 2241 | struct rcu_head *list; |
| 2225 | struct rcu_head *next; | 2242 | struct rcu_head *next; |
| 2226 | struct rcu_head **tail; | 2243 | struct rcu_head **tail; |
| @@ -2229,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2229 | /* Each pass through this loop invokes one batch of callbacks */ | 2246 | /* Each pass through this loop invokes one batch of callbacks */ |
| 2230 | for (;;) { | 2247 | for (;;) { |
| 2231 | /* If not polling, wait for next batch of callbacks. */ | 2248 | /* If not polling, wait for next batch of callbacks. */ |
| 2232 | if (!rcu_nocb_poll) | 2249 | if (!rcu_nocb_poll) { |
| 2250 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2251 | TPS("Sleep")); | ||
| 2233 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); | 2252 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); |
| 2253 | } else if (firsttime) { | ||
| 2254 | firsttime = 0; | ||
| 2255 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2256 | TPS("Poll")); | ||
| 2257 | } | ||
| 2234 | list = ACCESS_ONCE(rdp->nocb_head); | 2258 | list = ACCESS_ONCE(rdp->nocb_head); |
| 2235 | if (!list) { | 2259 | if (!list) { |
| 2260 | if (!rcu_nocb_poll) | ||
| 2261 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2262 | TPS("WokeEmpty")); | ||
| 2236 | schedule_timeout_interruptible(1); | 2263 | schedule_timeout_interruptible(1); |
| 2237 | flush_signals(current); | 2264 | flush_signals(current); |
| 2238 | continue; | 2265 | continue; |
| 2239 | } | 2266 | } |
| 2267 | firsttime = 1; | ||
| 2268 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2269 | TPS("WokeNonEmpty")); | ||
| 2240 | 2270 | ||
| 2241 | /* | 2271 | /* |
| 2242 | * Extract queued callbacks, update counts, and wait | 2272 | * Extract queued callbacks, update counts, and wait |
| @@ -2257,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2257 | next = list->next; | 2287 | next = list->next; |
| 2258 | /* Wait for enqueuing to complete, if needed. */ | 2288 | /* Wait for enqueuing to complete, if needed. */ |
| 2259 | while (next == NULL && &list->next != tail) { | 2289 | while (next == NULL && &list->next != tail) { |
| 2290 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2291 | TPS("WaitQueue")); | ||
| 2260 | schedule_timeout_interruptible(1); | 2292 | schedule_timeout_interruptible(1); |
| 2293 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2294 | TPS("WokeQueue")); | ||
| 2261 | next = list->next; | 2295 | next = list->next; |
| 2262 | } | 2296 | } |
| 2263 | debug_rcu_head_unqueue(list); | 2297 | debug_rcu_head_unqueue(list); |
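
rcu_nocb_kthread()'s outer loop now separates the sleeping mode (wait on nocb_wq when rcu_nocb_poll is clear) from the polling mode, and the new firsttime flag makes the "Poll" tracepoint fire only once per idle spell. A small userspace model of just that control flow (stubs only; the real loop also splices the callback list and waits for a grace period):

    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static bool polling;                     /* models rcu_nocb_poll */
    static int  pending = 3;                 /* pretend three batches were queued */

    static bool queue_empty(void)   { return pending == 0; }
    static void process_batch(void) { pending--; printf("invoked one batch\n"); }

    int main(void)
    {
            bool firsttime = true;
            int iterations = 6;

            while (iterations--) {
                    if (!polling) {
                            /* kernel: trace "Sleep", then wait_event_interruptible() */
                    } else if (firsttime) {
                            firsttime = false;  /* trace "Poll" only once per idle spell */
                    }
                    if (queue_empty()) {
                            usleep(1000);       /* kernel: schedule_timeout_interruptible(1) */
                            continue;
                    }
                    firsttime = true;           /* re-arm; kernel traces "WokeNonEmpty" */
                    process_batch();
            }
            return 0;
    }
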
diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c index cf6c17412932..3596797b7e46 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
| @@ -44,7 +44,7 @@ | |||
| 44 | #include <linux/seq_file.h> | 44 | #include <linux/seq_file.h> |
| 45 | 45 | ||
| 46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
| 47 | #include "rcutree.h" | 47 | #include "tree.h" |
| 48 | 48 | ||
| 49 | static int r_open(struct inode *inode, struct file *file, | 49 | static int r_open(struct inode *inode, struct file *file, |
| 50 | const struct seq_operations *op) | 50 | const struct seq_operations *op) |
diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c index b02a339836b4..6cb3dff89e2b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcu/update.c | |||
| @@ -53,6 +53,12 @@ | |||
| 53 | 53 | ||
| 54 | #include "rcu.h" | 54 | #include "rcu.h" |
| 55 | 55 | ||
| 56 | MODULE_ALIAS("rcupdate"); | ||
| 57 | #ifdef MODULE_PARAM_PREFIX | ||
| 58 | #undef MODULE_PARAM_PREFIX | ||
| 59 | #endif | ||
| 60 | #define MODULE_PARAM_PREFIX "rcupdate." | ||
| 61 | |||
| 56 | module_param(rcu_expedited, int, 0); | 62 | module_param(rcu_expedited, int, 0); |
| 57 | 63 | ||
| 58 | #ifdef CONFIG_PREEMPT_RCU | 64 | #ifdef CONFIG_PREEMPT_RCU |
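
Redefining MODULE_PARAM_PREFIX before the module_param() calls keeps the historical parameter names after the file's move into kernel/rcu/: built-in parameters are otherwise prefixed with the object-file name (which here would have become "update."), so the override keeps them reachable as rcupdate.rcu_expedited and friends on the kernel command line. The same pattern for a hypothetical file that wants its parameters under "foo.":

    #include <linux/module.h>
    #include <linux/moduleparam.h>

    MODULE_ALIAS("foo");                     /* hypothetical old module name */
    #ifdef MODULE_PARAM_PREFIX
    #undef MODULE_PARAM_PREFIX
    #endif
    #define MODULE_PARAM_PREFIX "foo."

    static int foo_level;
    module_param(foo_level, int, 0644);      /* shows up as foo.foo_level */
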
| @@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void) | |||
| 148 | { | 154 | { |
| 149 | if (!debug_lockdep_rcu_enabled()) | 155 | if (!debug_lockdep_rcu_enabled()) |
| 150 | return 1; | 156 | return 1; |
| 151 | if (rcu_is_cpu_idle()) | 157 | if (!rcu_is_watching()) |
| 152 | return 0; | 158 | return 0; |
| 153 | if (!rcu_lockdep_current_cpu_online()) | 159 | if (!rcu_lockdep_current_cpu_online()) |
| 154 | return 0; | 160 | return 0; |
| @@ -298,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); | |||
| 298 | #endif | 304 | #endif |
| 299 | 305 | ||
| 300 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | 306 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
| 301 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | 307 | static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; |
| 302 | 308 | ||
| 303 | module_param(rcu_cpu_stall_suppress, int, 0644); | 309 | module_param(rcu_cpu_stall_suppress, int, 0644); |
| 304 | module_param(rcu_cpu_stall_timeout, int, 0644); | 310 | module_param(rcu_cpu_stall_timeout, int, 0644); |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54adcf35f495..7b621409cf15 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
| @@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | |||
| 12 | endif | 12 | endif |
| 13 | 13 | ||
| 14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o |
| 15 | obj-y += wait.o completion.o | ||
| 15 | obj-$(CONFIG_SMP) += cpupri.o | 16 | obj-$(CONFIG_SMP) += cpupri.o |
| 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 17 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
| 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 18 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c new file mode 100644 index 000000000000..a63f4dc27909 --- /dev/null +++ b/kernel/sched/completion.c | |||
| @@ -0,0 +1,299 @@ | |||
| 1 | /* | ||
| 2 | * Generic wait-for-completion handler; | ||
| 3 | * | ||
| 4 | * It differs from semaphores in that the default case is the opposite: | ||
| 5 | * wait_for_completion() blocks by default, whereas a semaphore does not. The | ||
| 6 | * interface also makes it easy to 'complete' multiple waiting threads, | ||
| 7 | * something which isn't entirely natural for semaphores. | ||
| 8 | * | ||
| 9 | * But more importantly, the primitive documents the usage. Semaphores would | ||
| 10 | * typically be used for exclusion which gives rise to priority inversion. | ||
| 11 | * Waiting for completion is typically a sync point, but not an exclusion point. | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/sched.h> | ||
| 15 | #include <linux/completion.h> | ||
| 16 | |||
| 17 | /** | ||
| 18 | * complete: - signals a single thread waiting on this completion | ||
| 19 | * @x: holds the state of this particular completion | ||
| 20 | * | ||
| 21 | * This will wake up a single thread waiting on this completion. Threads will be | ||
| 22 | * awakened in the same order in which they were queued. | ||
| 23 | * | ||
| 24 | * See also complete_all(), wait_for_completion() and related routines. | ||
| 25 | * | ||
| 26 | * It may be assumed that this function implies a write memory barrier before | ||
| 27 | * changing the task state if and only if any tasks are woken up. | ||
| 28 | */ | ||
| 29 | void complete(struct completion *x) | ||
| 30 | { | ||
| 31 | unsigned long flags; | ||
| 32 | |||
| 33 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 34 | x->done++; | ||
| 35 | __wake_up_locked(&x->wait, TASK_NORMAL, 1); | ||
| 36 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 37 | } | ||
| 38 | EXPORT_SYMBOL(complete); | ||
| 39 | |||
| 40 | /** | ||
| 41 | * complete_all: - signals all threads waiting on this completion | ||
| 42 | * @x: holds the state of this particular completion | ||
| 43 | * | ||
| 44 | * This will wake up all threads waiting on this particular completion event. | ||
| 45 | * | ||
| 46 | * It may be assumed that this function implies a write memory barrier before | ||
| 47 | * changing the task state if and only if any tasks are woken up. | ||
| 48 | */ | ||
| 49 | void complete_all(struct completion *x) | ||
| 50 | { | ||
| 51 | unsigned long flags; | ||
| 52 | |||
| 53 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 54 | x->done += UINT_MAX/2; | ||
| 55 | __wake_up_locked(&x->wait, TASK_NORMAL, 0); | ||
| 56 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 57 | } | ||
| 58 | EXPORT_SYMBOL(complete_all); | ||
| 59 | |||
| 60 | static inline long __sched | ||
| 61 | do_wait_for_common(struct completion *x, | ||
| 62 | long (*action)(long), long timeout, int state) | ||
| 63 | { | ||
| 64 | if (!x->done) { | ||
| 65 | DECLARE_WAITQUEUE(wait, current); | ||
| 66 | |||
| 67 | __add_wait_queue_tail_exclusive(&x->wait, &wait); | ||
| 68 | do { | ||
| 69 | if (signal_pending_state(state, current)) { | ||
| 70 | timeout = -ERESTARTSYS; | ||
| 71 | break; | ||
| 72 | } | ||
| 73 | __set_current_state(state); | ||
| 74 | spin_unlock_irq(&x->wait.lock); | ||
| 75 | timeout = action(timeout); | ||
| 76 | spin_lock_irq(&x->wait.lock); | ||
| 77 | } while (!x->done && timeout); | ||
| 78 | __remove_wait_queue(&x->wait, &wait); | ||
| 79 | if (!x->done) | ||
| 80 | return timeout; | ||
| 81 | } | ||
| 82 | x->done--; | ||
| 83 | return timeout ?: 1; | ||
| 84 | } | ||
| 85 | |||
| 86 | static inline long __sched | ||
| 87 | __wait_for_common(struct completion *x, | ||
| 88 | long (*action)(long), long timeout, int state) | ||
| 89 | { | ||
| 90 | might_sleep(); | ||
| 91 | |||
| 92 | spin_lock_irq(&x->wait.lock); | ||
| 93 | timeout = do_wait_for_common(x, action, timeout, state); | ||
| 94 | spin_unlock_irq(&x->wait.lock); | ||
| 95 | return timeout; | ||
| 96 | } | ||
| 97 | |||
| 98 | static long __sched | ||
| 99 | wait_for_common(struct completion *x, long timeout, int state) | ||
| 100 | { | ||
| 101 | return __wait_for_common(x, schedule_timeout, timeout, state); | ||
| 102 | } | ||
| 103 | |||
| 104 | static long __sched | ||
| 105 | wait_for_common_io(struct completion *x, long timeout, int state) | ||
| 106 | { | ||
| 107 | return __wait_for_common(x, io_schedule_timeout, timeout, state); | ||
| 108 | } | ||
| 109 | |||
| 110 | /** | ||
| 111 | * wait_for_completion: - waits for completion of a task | ||
| 112 | * @x: holds the state of this particular completion | ||
| 113 | * | ||
| 114 | * This waits to be signaled for completion of a specific task. It is NOT | ||
| 115 | * interruptible and there is no timeout. | ||
| 116 | * | ||
| 117 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
| 118 | * and interrupt capability. Also see complete(). | ||
| 119 | */ | ||
| 120 | void __sched wait_for_completion(struct completion *x) | ||
| 121 | { | ||
| 122 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
| 123 | } | ||
| 124 | EXPORT_SYMBOL(wait_for_completion); | ||
| 125 | |||
| 126 | /** | ||
| 127 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
| 128 | * @x: holds the state of this particular completion | ||
| 129 | * @timeout: timeout value in jiffies | ||
| 130 | * | ||
| 131 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 132 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
| 133 | * interruptible. | ||
| 134 | * | ||
| 135 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | ||
| 136 | * till timeout) if completed. | ||
| 137 | */ | ||
| 138 | unsigned long __sched | ||
| 139 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
| 140 | { | ||
| 141 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); | ||
| 142 | } | ||
| 143 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
| 144 | |||
| 145 | /** | ||
| 146 | * wait_for_completion_io: - waits for completion of a task | ||
| 147 | * @x: holds the state of this particular completion | ||
| 148 | * | ||
| 149 | * This waits to be signaled for completion of a specific task. It is NOT | ||
| 150 | * interruptible and there is no timeout. The caller is accounted as waiting | ||
| 151 | * for IO. | ||
| 152 | */ | ||
| 153 | void __sched wait_for_completion_io(struct completion *x) | ||
| 154 | { | ||
| 155 | wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
| 156 | } | ||
| 157 | EXPORT_SYMBOL(wait_for_completion_io); | ||
| 158 | |||
| 159 | /** | ||
| 160 | * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) | ||
| 161 | * @x: holds the state of this particular completion | ||
| 162 | * @timeout: timeout value in jiffies | ||
| 163 | * | ||
| 164 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 165 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
| 166 | * interruptible. The caller is accounted as waiting for IO. | ||
| 167 | * | ||
| 168 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | ||
| 169 | * till timeout) if completed. | ||
| 170 | */ | ||
| 171 | unsigned long __sched | ||
| 172 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) | ||
| 173 | { | ||
| 174 | return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); | ||
| 175 | } | ||
| 176 | EXPORT_SYMBOL(wait_for_completion_io_timeout); | ||
| 177 | |||
| 178 | /** | ||
| 179 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
| 180 | * @x: holds the state of this particular completion | ||
| 181 | * | ||
| 182 | * This waits for completion of a specific task to be signaled. It is | ||
| 183 | * interruptible. | ||
| 184 | * | ||
| 185 | * Return: -ERESTARTSYS if interrupted, 0 if completed. | ||
| 186 | */ | ||
| 187 | int __sched wait_for_completion_interruptible(struct completion *x) | ||
| 188 | { | ||
| 189 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | ||
| 190 | if (t == -ERESTARTSYS) | ||
| 191 | return t; | ||
| 192 | return 0; | ||
| 193 | } | ||
| 194 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
| 195 | |||
| 196 | /** | ||
| 197 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
| 198 | * @x: holds the state of this particular completion | ||
| 199 | * @timeout: timeout value in jiffies | ||
| 200 | * | ||
| 201 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 202 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
| 203 | * | ||
| 204 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, | ||
| 205 | * or number of jiffies left till timeout) if completed. | ||
| 206 | */ | ||
| 207 | long __sched | ||
| 208 | wait_for_completion_interruptible_timeout(struct completion *x, | ||
| 209 | unsigned long timeout) | ||
| 210 | { | ||
| 211 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); | ||
| 212 | } | ||
| 213 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
| 214 | |||
| 215 | /** | ||
| 216 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
| 217 | * @x: holds the state of this particular completion | ||
| 218 | * | ||
| 219 | * This waits to be signaled for completion of a specific task. It can be | ||
| 220 | * interrupted by a kill signal. | ||
| 221 | * | ||
| 222 | * Return: -ERESTARTSYS if interrupted, 0 if completed. | ||
| 223 | */ | ||
| 224 | int __sched wait_for_completion_killable(struct completion *x) | ||
| 225 | { | ||
| 226 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | ||
| 227 | if (t == -ERESTARTSYS) | ||
| 228 | return t; | ||
| 229 | return 0; | ||
| 230 | } | ||
| 231 | EXPORT_SYMBOL(wait_for_completion_killable); | ||
| 232 | |||
| 233 | /** | ||
| 234 | * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) | ||
| 235 | * @x: holds the state of this particular completion | ||
| 236 | * @timeout: timeout value in jiffies | ||
| 237 | * | ||
| 238 | * This waits for either a completion of a specific task to be | ||
| 239 | * signaled or for a specified timeout to expire. It can be | ||
| 240 | * interrupted by a kill signal. The timeout is in jiffies. | ||
| 241 | * | ||
| 242 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, | ||
| 243 | * or number of jiffies left till timeout) if completed. | ||
| 244 | */ | ||
| 245 | long __sched | ||
| 246 | wait_for_completion_killable_timeout(struct completion *x, | ||
| 247 | unsigned long timeout) | ||
| 248 | { | ||
| 249 | return wait_for_common(x, timeout, TASK_KILLABLE); | ||
| 250 | } | ||
| 251 | EXPORT_SYMBOL(wait_for_completion_killable_timeout); | ||
| 252 | |||
| 253 | /** | ||
| 254 | * try_wait_for_completion - try to decrement a completion without blocking | ||
| 255 | * @x: completion structure | ||
| 256 | * | ||
| 257 | * Return: 0 if a decrement cannot be done without blocking | ||
| 258 | * 1 if a decrement succeeded. | ||
| 259 | * | ||
| 260 | * If a completion is being used as a counting completion, | ||
| 261 | * attempt to decrement the counter without blocking. This | ||
| 262 | * enables us to avoid waiting if the resource the completion | ||
| 263 | * is protecting is not available. | ||
| 264 | */ | ||
| 265 | bool try_wait_for_completion(struct completion *x) | ||
| 266 | { | ||
| 267 | unsigned long flags; | ||
| 268 | int ret = 1; | ||
| 269 | |||
| 270 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 271 | if (!x->done) | ||
| 272 | ret = 0; | ||
| 273 | else | ||
| 274 | x->done--; | ||
| 275 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 276 | return ret; | ||
| 277 | } | ||
| 278 | EXPORT_SYMBOL(try_wait_for_completion); | ||
| 279 | |||
| 280 | /** | ||
| 281 | * completion_done - Test to see if a completion has any waiters | ||
| 282 | * @x: completion structure | ||
| 283 | * | ||
| 284 | * Return: 0 if there are waiters (wait_for_completion() in progress) | ||
| 285 | * 1 if there are no waiters. | ||
| 286 | * | ||
| 287 | */ | ||
| 288 | bool completion_done(struct completion *x) | ||
| 289 | { | ||
| 290 | unsigned long flags; | ||
| 291 | int ret = 1; | ||
| 292 | |||
| 293 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 294 | if (!x->done) | ||
| 295 | ret = 0; | ||
| 296 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 297 | return ret; | ||
| 298 | } | ||
| 299 | EXPORT_SYMBOL(completion_done); | ||
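
With the completion primitives now collected in kernel/sched/completion.c, their use is unchanged. A minimal kernel-context usage sketch with hypothetical names; the completer and the waiter would normally run in different contexts:

    #include <linux/completion.h>
    #include <linux/errno.h>
    #include <linux/jiffies.h>

    static DECLARE_COMPLETION(demo_done);

    /* Completer side: wake exactly one waiter, in FIFO order. */
    static void demo_finish(void)
    {
            complete(&demo_done);
    }

    /* Waiter side: block until demo_finish() runs, giving up after one second. */
    static int demo_wait(void)
    {
            if (!wait_for_completion_timeout(&demo_done, HZ))
                    return -ETIMEDOUT;
            return 0;
    }
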
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995a..1deccd78be98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -513,12 +513,11 @@ static inline void init_hrtick(void) | |||
| 513 | * might also involve a cross-CPU call to trigger the scheduler on | 513 | * might also involve a cross-CPU call to trigger the scheduler on |
| 514 | * the target CPU. | 514 | * the target CPU. |
| 515 | */ | 515 | */ |
| 516 | #ifdef CONFIG_SMP | ||
| 517 | void resched_task(struct task_struct *p) | 516 | void resched_task(struct task_struct *p) |
| 518 | { | 517 | { |
| 519 | int cpu; | 518 | int cpu; |
| 520 | 519 | ||
| 521 | assert_raw_spin_locked(&task_rq(p)->lock); | 520 | lockdep_assert_held(&task_rq(p)->lock); |
| 522 | 521 | ||
| 523 | if (test_tsk_need_resched(p)) | 522 | if (test_tsk_need_resched(p)) |
| 524 | return; | 523 | return; |
| @@ -526,8 +525,10 @@ void resched_task(struct task_struct *p) | |||
| 526 | set_tsk_need_resched(p); | 525 | set_tsk_need_resched(p); |
| 527 | 526 | ||
| 528 | cpu = task_cpu(p); | 527 | cpu = task_cpu(p); |
| 529 | if (cpu == smp_processor_id()) | 528 | if (cpu == smp_processor_id()) { |
| 529 | set_preempt_need_resched(); | ||
| 530 | return; | 530 | return; |
| 531 | } | ||
| 531 | 532 | ||
| 532 | /* NEED_RESCHED must be visible before we test polling */ | 533 | /* NEED_RESCHED must be visible before we test polling */ |
| 533 | smp_mb(); | 534 | smp_mb(); |
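
resched_task() becomes common code here (the old !SMP variant is deleted in a later hunk), and when the target task runs on the current CPU it additionally calls set_preempt_need_resched(), which folds the resched request into the per-CPU preempt count. On x86 in this series the flag is, as far as I can tell, kept as an inverted bit inside the count, so a single "count == 0" test at preempt_enable() time covers both "no nesting" and "reschedule wanted". A userspace model of that folding trick, not the kernel's actual definitions:

    #include <stdbool.h>
    #include <stdio.h>

    #define NEED_RESCHED_BIT 0x80000000u

    /* Stored inverted: the bit is CLEAR when a reschedule is wanted, so a   */
    /* plain "count == 0" means "no preempt nesting AND reschedule pending". */
    static unsigned int preempt_count = NEED_RESCHED_BIT;

    static void set_need_resched(void)   { preempt_count &= ~NEED_RESCHED_BIT; }
    static void clear_need_resched(void) { preempt_count |=  NEED_RESCHED_BIT; }
    static bool should_resched(void)     { return preempt_count == 0; }

    int main(void)
    {
            printf("resched? %d\n", should_resched());  /* 0: nothing pending */
            set_need_resched();
            printf("resched? %d\n", should_resched());  /* 1: count 0, resched wanted */
            clear_need_resched();
            printf("resched? %d\n", should_resched());  /* 0 again */
            return 0;
    }
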
| @@ -546,6 +547,7 @@ void resched_cpu(int cpu) | |||
| 546 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 547 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 547 | } | 548 | } |
| 548 | 549 | ||
| 550 | #ifdef CONFIG_SMP | ||
| 549 | #ifdef CONFIG_NO_HZ_COMMON | 551 | #ifdef CONFIG_NO_HZ_COMMON |
| 550 | /* | 552 | /* |
| 551 | * In the semi idle case, use the nearest busy cpu for migrating timers | 553 | * In the semi idle case, use the nearest busy cpu for migrating timers |
| @@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq) | |||
| 693 | } | 695 | } |
| 694 | } | 696 | } |
| 695 | 697 | ||
| 696 | #else /* !CONFIG_SMP */ | ||
| 697 | void resched_task(struct task_struct *p) | ||
| 698 | { | ||
| 699 | assert_raw_spin_locked(&task_rq(p)->lock); | ||
| 700 | set_tsk_need_resched(p); | ||
| 701 | } | ||
| 702 | #endif /* CONFIG_SMP */ | 698 | #endif /* CONFIG_SMP */ |
| 703 | 699 | ||
| 704 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ | 700 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
| @@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p) | |||
| 767 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 763 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
| 768 | { | 764 | { |
| 769 | update_rq_clock(rq); | 765 | update_rq_clock(rq); |
| 770 | sched_info_queued(p); | 766 | sched_info_queued(rq, p); |
| 771 | p->sched_class->enqueue_task(rq, p, flags); | 767 | p->sched_class->enqueue_task(rq, p, flags); |
| 772 | } | 768 | } |
| 773 | 769 | ||
| 774 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 770 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
| 775 | { | 771 | { |
| 776 | update_rq_clock(rq); | 772 | update_rq_clock(rq); |
| 777 | sched_info_dequeued(p); | 773 | sched_info_dequeued(rq, p); |
| 778 | p->sched_class->dequeue_task(rq, p, flags); | 774 | p->sched_class->dequeue_task(rq, p, flags); |
| 779 | } | 775 | } |
| 780 | 776 | ||
| @@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 987 | * ttwu() will sort out the placement. | 983 | * ttwu() will sort out the placement. |
| 988 | */ | 984 | */ |
| 989 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 985 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
| 990 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 986 | !(task_preempt_count(p) & PREEMPT_ACTIVE)); |
| 991 | 987 | ||
| 992 | #ifdef CONFIG_LOCKDEP | 988 | #ifdef CONFIG_LOCKDEP |
| 993 | /* | 989 | /* |
| @@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1017 | __set_task_cpu(p, new_cpu); | 1013 | __set_task_cpu(p, new_cpu); |
| 1018 | } | 1014 | } |
| 1019 | 1015 | ||
| 1016 | static void __migrate_swap_task(struct task_struct *p, int cpu) | ||
| 1017 | { | ||
| 1018 | if (p->on_rq) { | ||
| 1019 | struct rq *src_rq, *dst_rq; | ||
| 1020 | |||
| 1021 | src_rq = task_rq(p); | ||
| 1022 | dst_rq = cpu_rq(cpu); | ||
| 1023 | |||
| 1024 | deactivate_task(src_rq, p, 0); | ||
| 1025 | set_task_cpu(p, cpu); | ||
| 1026 | activate_task(dst_rq, p, 0); | ||
| 1027 | check_preempt_curr(dst_rq, p, 0); | ||
| 1028 | } else { | ||
| 1029 | /* | ||
| 1030 | * Task isn't running anymore; make it appear like we migrated | ||
| 1031 | * it before it went to sleep. This means on wakeup we make the | ||
| 1032 | * previous cpu our target instead of where it really is. | ||
| 1033 | */ | ||
| 1034 | p->wake_cpu = cpu; | ||
| 1035 | } | ||
| 1036 | } | ||
| 1037 | |||
| 1038 | struct migration_swap_arg { | ||
| 1039 | struct task_struct *src_task, *dst_task; | ||
| 1040 | int src_cpu, dst_cpu; | ||
| 1041 | }; | ||
| 1042 | |||
| 1043 | static int migrate_swap_stop(void *data) | ||
| 1044 | { | ||
| 1045 | struct migration_swap_arg *arg = data; | ||
| 1046 | struct rq *src_rq, *dst_rq; | ||
| 1047 | int ret = -EAGAIN; | ||
| 1048 | |||
| 1049 | src_rq = cpu_rq(arg->src_cpu); | ||
| 1050 | dst_rq = cpu_rq(arg->dst_cpu); | ||
| 1051 | |||
| 1052 | double_raw_lock(&arg->src_task->pi_lock, | ||
| 1053 | &arg->dst_task->pi_lock); | ||
| 1054 | double_rq_lock(src_rq, dst_rq); | ||
| 1055 | if (task_cpu(arg->dst_task) != arg->dst_cpu) | ||
| 1056 | goto unlock; | ||
| 1057 | |||
| 1058 | if (task_cpu(arg->src_task) != arg->src_cpu) | ||
| 1059 | goto unlock; | ||
| 1060 | |||
| 1061 | if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) | ||
| 1062 | goto unlock; | ||
| 1063 | |||
| 1064 | if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) | ||
| 1065 | goto unlock; | ||
| 1066 | |||
| 1067 | __migrate_swap_task(arg->src_task, arg->dst_cpu); | ||
| 1068 | __migrate_swap_task(arg->dst_task, arg->src_cpu); | ||
| 1069 | |||
| 1070 | ret = 0; | ||
| 1071 | |||
| 1072 | unlock: | ||
| 1073 | double_rq_unlock(src_rq, dst_rq); | ||
| 1074 | raw_spin_unlock(&arg->dst_task->pi_lock); | ||
| 1075 | raw_spin_unlock(&arg->src_task->pi_lock); | ||
| 1076 | |||
| 1077 | return ret; | ||
| 1078 | } | ||
| 1079 | |||
| 1080 | /* | ||
| 1081 | * Cross migrate two tasks | ||
| 1082 | */ | ||
| 1083 | int migrate_swap(struct task_struct *cur, struct task_struct *p) | ||
| 1084 | { | ||
| 1085 | struct migration_swap_arg arg; | ||
| 1086 | int ret = -EINVAL; | ||
| 1087 | |||
| 1088 | arg = (struct migration_swap_arg){ | ||
| 1089 | .src_task = cur, | ||
| 1090 | .src_cpu = task_cpu(cur), | ||
| 1091 | .dst_task = p, | ||
| 1092 | .dst_cpu = task_cpu(p), | ||
| 1093 | }; | ||
| 1094 | |||
| 1095 | if (arg.src_cpu == arg.dst_cpu) | ||
| 1096 | goto out; | ||
| 1097 | |||
| 1098 | /* | ||
| 1099 | * These three tests are all lockless; this is OK since all of them | ||
| 1100 | * will be re-checked with proper locks held further down the line. | ||
| 1101 | */ | ||
| 1102 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) | ||
| 1103 | goto out; | ||
| 1104 | |||
| 1105 | if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) | ||
| 1106 | goto out; | ||
| 1107 | |||
| 1108 | if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) | ||
| 1109 | goto out; | ||
| 1110 | |||
| 1111 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); | ||
| 1112 | |||
| 1113 | out: | ||
| 1114 | return ret; | ||
| 1115 | } | ||
| 1116 | |||
| 1020 | struct migration_arg { | 1117 | struct migration_arg { |
| 1021 | struct task_struct *task; | 1118 | struct task_struct *task; |
| 1022 | int dest_cpu; | 1119 | int dest_cpu; |
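
migrate_swap(), added above, exchanges the CPUs of two tasks: a few lockless sanity checks filter the obvious failures, then stop_two_cpus() runs migrate_swap_stop() while both CPUs sit in their stopper threads, with both pi locks and both runqueue locks held, re-validating the placement before calling __migrate_swap_task() in each direction. A hypothetical caller (the NUMA balancing code later in this series is the real one) might use it roughly as follows, assuming the scheduler-internal declaration is visible:

    #include <linux/sched.h>
    #include <linux/printk.h>

    extern int migrate_swap(struct task_struct *cur, struct task_struct *p);

    static int demo_try_swap(struct task_struct *p)
    {
            int ret;

            ret = migrate_swap(current, p);  /* -EINVAL, -EAGAIN, or 0 on success */
            if (ret)
                    pr_debug("swap with pid %d failed: %d\n", task_pid_nr(p), ret);
            return ret;
    }
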
| @@ -1236,9 +1333,9 @@ out: | |||
| 1236 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. | 1333 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
| 1237 | */ | 1334 | */ |
| 1238 | static inline | 1335 | static inline |
| 1239 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | 1336 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
| 1240 | { | 1337 | { |
| 1241 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); | 1338 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); |
| 1242 | 1339 | ||
| 1243 | /* | 1340 | /* |
| 1244 | * In order not to call set_task_cpu() on a blocking task we need | 1341 | * In order not to call set_task_cpu() on a blocking task we need |
| @@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | |||
| 1330 | 1427 | ||
| 1331 | if (rq->idle_stamp) { | 1428 | if (rq->idle_stamp) { |
| 1332 | u64 delta = rq_clock(rq) - rq->idle_stamp; | 1429 | u64 delta = rq_clock(rq) - rq->idle_stamp; |
| 1333 | u64 max = 2*sysctl_sched_migration_cost; | 1430 | u64 max = 2*rq->max_idle_balance_cost; |
| 1431 | |||
| 1432 | update_avg(&rq->avg_idle, delta); | ||
| 1334 | 1433 | ||
| 1335 | if (delta > max) | 1434 | if (rq->avg_idle > max) |
| 1336 | rq->avg_idle = max; | 1435 | rq->avg_idle = max; |
| 1337 | else | 1436 | |
| 1338 | update_avg(&rq->avg_idle, delta); | ||
| 1339 | rq->idle_stamp = 0; | 1437 | rq->idle_stamp = 0; |
| 1340 | } | 1438 | } |
| 1341 | #endif | 1439 | #endif |
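
The idle-stamp handling in ttwu_do_wakeup() now always feeds the measured idle period into update_avg() and only then clamps rq->avg_idle to twice max_idle_balance_cost, instead of replacing the average with the cap whenever a single sample was large. update_avg() in this file is, if memory serves, a 1/8-weight exponential moving average, so the effect is roughly:

    #include <stdio.h>

    /* Sketch of the kernel's update_avg(): avg += (sample - avg) / 8. */
    static void update_avg(unsigned long long *avg, unsigned long long sample)
    {
            long long diff = (long long)(sample - *avg);

            *avg += diff / 8;
    }

    int main(void)
    {
            unsigned long long avg_idle = 0;
            unsigned long long max = 2 * 500000ULL;   /* stand-in for 2*max_idle_balance_cost */
            unsigned long long samples[] = { 100000, 5000000, 300000 };

            for (int i = 0; i < 3; i++) {
                    update_avg(&avg_idle, samples[i]);
                    if (avg_idle > max)
                            avg_idle = max;           /* clamp the average, not the sample */
                    printf("avg_idle = %llu\n", avg_idle);
            }
            return 0;
    }
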
| @@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void) | |||
| 1396 | 1494 | ||
| 1397 | void scheduler_ipi(void) | 1495 | void scheduler_ipi(void) |
| 1398 | { | 1496 | { |
| 1497 | /* | ||
| 1498 | * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting | ||
| 1499 | * TIF_NEED_RESCHED remotely (for the first time) will also send | ||
| 1500 | * this IPI. | ||
| 1501 | */ | ||
| 1502 | if (tif_need_resched()) | ||
| 1503 | set_preempt_need_resched(); | ||
| 1504 | |||
| 1399 | if (llist_empty(&this_rq()->wake_list) | 1505 | if (llist_empty(&this_rq()->wake_list) |
| 1400 | && !tick_nohz_full_cpu(smp_processor_id()) | 1506 | && !tick_nohz_full_cpu(smp_processor_id()) |
| 1401 | && !got_nohz_idle_kick()) | 1507 | && !got_nohz_idle_kick()) |
| @@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 1513 | if (p->sched_class->task_waking) | 1619 | if (p->sched_class->task_waking) |
| 1514 | p->sched_class->task_waking(p); | 1620 | p->sched_class->task_waking(p); |
| 1515 | 1621 | ||
| 1516 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | 1622 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); |
| 1517 | if (task_cpu(p) != cpu) { | 1623 | if (task_cpu(p) != cpu) { |
| 1518 | wake_flags |= WF_MIGRATED; | 1624 | wake_flags |= WF_MIGRATED; |
| 1519 | set_task_cpu(p, cpu); | 1625 | set_task_cpu(p, cpu); |
| @@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
| 1595 | * | 1701 | * |
| 1596 | * __sched_fork() is basic setup used by init_idle() too: | 1702 | * __sched_fork() is basic setup used by init_idle() too: |
| 1597 | */ | 1703 | */ |
| 1598 | static void __sched_fork(struct task_struct *p) | 1704 | static void __sched_fork(unsigned long clone_flags, struct task_struct *p) |
| 1599 | { | 1705 | { |
| 1600 | p->on_rq = 0; | 1706 | p->on_rq = 0; |
| 1601 | 1707 | ||
| @@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p) | |||
| 1619 | 1725 | ||
| 1620 | #ifdef CONFIG_NUMA_BALANCING | 1726 | #ifdef CONFIG_NUMA_BALANCING |
| 1621 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { | 1727 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { |
| 1622 | p->mm->numa_next_scan = jiffies; | 1728 | p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); |
| 1623 | p->mm->numa_next_reset = jiffies; | ||
| 1624 | p->mm->numa_scan_seq = 0; | 1729 | p->mm->numa_scan_seq = 0; |
| 1625 | } | 1730 | } |
| 1626 | 1731 | ||
| 1732 | if (clone_flags & CLONE_VM) | ||
| 1733 | p->numa_preferred_nid = current->numa_preferred_nid; | ||
| 1734 | else | ||
| 1735 | p->numa_preferred_nid = -1; | ||
| 1736 | |||
| 1627 | p->node_stamp = 0ULL; | 1737 | p->node_stamp = 0ULL; |
| 1628 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | 1738 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; |
| 1629 | p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; | ||
| 1630 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | 1739 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; |
| 1631 | p->numa_work.next = &p->numa_work; | 1740 | p->numa_work.next = &p->numa_work; |
| 1741 | p->numa_faults = NULL; | ||
| 1742 | p->numa_faults_buffer = NULL; | ||
| 1743 | |||
| 1744 | INIT_LIST_HEAD(&p->numa_entry); | ||
| 1745 | p->numa_group = NULL; | ||
| 1632 | #endif /* CONFIG_NUMA_BALANCING */ | 1746 | #endif /* CONFIG_NUMA_BALANCING */ |
| 1633 | } | 1747 | } |
| 1634 | 1748 | ||
| @@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled) | |||
| 1654 | /* | 1768 | /* |
| 1655 | * fork()/clone()-time setup: | 1769 | * fork()/clone()-time setup: |
| 1656 | */ | 1770 | */ |
| 1657 | void sched_fork(struct task_struct *p) | 1771 | void sched_fork(unsigned long clone_flags, struct task_struct *p) |
| 1658 | { | 1772 | { |
| 1659 | unsigned long flags; | 1773 | unsigned long flags; |
| 1660 | int cpu = get_cpu(); | 1774 | int cpu = get_cpu(); |
| 1661 | 1775 | ||
| 1662 | __sched_fork(p); | 1776 | __sched_fork(clone_flags, p); |
| 1663 | /* | 1777 | /* |
| 1664 | * We mark the process as running here. This guarantees that | 1778 | * We mark the process as running here. This guarantees that |
| 1665 | * nobody will actually run it, and a signal or other external | 1779 | * nobody will actually run it, and a signal or other external |
| @@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p) | |||
| 1717 | #if defined(CONFIG_SMP) | 1831 | #if defined(CONFIG_SMP) |
| 1718 | p->on_cpu = 0; | 1832 | p->on_cpu = 0; |
| 1719 | #endif | 1833 | #endif |
| 1720 | #ifdef CONFIG_PREEMPT_COUNT | 1834 | init_task_preempt_count(p); |
| 1721 | /* Want to start with kernel preemption disabled. */ | ||
| 1722 | task_thread_info(p)->preempt_count = 1; | ||
| 1723 | #endif | ||
| 1724 | #ifdef CONFIG_SMP | 1835 | #ifdef CONFIG_SMP |
| 1725 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 1836 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
| 1726 | #endif | 1837 | #endif |
| @@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p) | |||
| 1747 | * - cpus_allowed can change in the fork path | 1858 | * - cpus_allowed can change in the fork path |
| 1748 | * - any previously selected cpu might disappear through hotplug | 1859 | * - any previously selected cpu might disappear through hotplug |
| 1749 | */ | 1860 | */ |
| 1750 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); | 1861 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
| 1751 | #endif | 1862 | #endif |
| 1752 | 1863 | ||
| 1753 | /* Initialize new task's runnable average */ | 1864 | /* Initialize new task's runnable average */ |
| @@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
| 1838 | struct task_struct *next) | 1949 | struct task_struct *next) |
| 1839 | { | 1950 | { |
| 1840 | trace_sched_switch(prev, next); | 1951 | trace_sched_switch(prev, next); |
| 1841 | sched_info_switch(prev, next); | 1952 | sched_info_switch(rq, prev, next); |
| 1842 | perf_event_task_sched_out(prev, next); | 1953 | perf_event_task_sched_out(prev, next); |
| 1843 | fire_sched_out_preempt_notifiers(prev, next); | 1954 | fire_sched_out_preempt_notifiers(prev, next); |
| 1844 | prepare_lock_switch(rq, next); | 1955 | prepare_lock_switch(rq, next); |
| @@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 1890 | if (mm) | 2001 | if (mm) |
| 1891 | mmdrop(mm); | 2002 | mmdrop(mm); |
| 1892 | if (unlikely(prev_state == TASK_DEAD)) { | 2003 | if (unlikely(prev_state == TASK_DEAD)) { |
| 2004 | task_numa_free(prev); | ||
| 2005 | |||
| 1893 | /* | 2006 | /* |
| 1894 | * Remove function-return probe instances associated with this | 2007 | * Remove function-return probe instances associated with this |
| 1895 | * task and put them back on the free list. | 2008 | * task and put them back on the free list. |
| @@ -2073,7 +2186,7 @@ void sched_exec(void) | |||
| 2073 | int dest_cpu; | 2186 | int dest_cpu; |
| 2074 | 2187 | ||
| 2075 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2188 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 2076 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); | 2189 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); |
| 2077 | if (dest_cpu == smp_processor_id()) | 2190 | if (dest_cpu == smp_processor_id()) |
| 2078 | goto unlock; | 2191 | goto unlock; |
| 2079 | 2192 | ||
| @@ -2215,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) | |||
| 2215 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 2328 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
| 2216 | defined(CONFIG_PREEMPT_TRACER)) | 2329 | defined(CONFIG_PREEMPT_TRACER)) |
| 2217 | 2330 | ||
| 2218 | void __kprobes add_preempt_count(int val) | 2331 | void __kprobes preempt_count_add(int val) |
| 2219 | { | 2332 | { |
| 2220 | #ifdef CONFIG_DEBUG_PREEMPT | 2333 | #ifdef CONFIG_DEBUG_PREEMPT |
| 2221 | /* | 2334 | /* |
| @@ -2224,7 +2337,7 @@ void __kprobes add_preempt_count(int val) | |||
| 2224 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 2337 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
| 2225 | return; | 2338 | return; |
| 2226 | #endif | 2339 | #endif |
| 2227 | preempt_count() += val; | 2340 | __preempt_count_add(val); |
| 2228 | #ifdef CONFIG_DEBUG_PREEMPT | 2341 | #ifdef CONFIG_DEBUG_PREEMPT |
| 2229 | /* | 2342 | /* |
| 2230 | * Spinlock count overflowing soon? | 2343 | * Spinlock count overflowing soon? |
| @@ -2235,9 +2348,9 @@ void __kprobes add_preempt_count(int val) | |||
| 2235 | if (preempt_count() == val) | 2348 | if (preempt_count() == val) |
| 2236 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 2349 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
| 2237 | } | 2350 | } |
| 2238 | EXPORT_SYMBOL(add_preempt_count); | 2351 | EXPORT_SYMBOL(preempt_count_add); |
| 2239 | 2352 | ||
| 2240 | void __kprobes sub_preempt_count(int val) | 2353 | void __kprobes preempt_count_sub(int val) |
| 2241 | { | 2354 | { |
| 2242 | #ifdef CONFIG_DEBUG_PREEMPT | 2355 | #ifdef CONFIG_DEBUG_PREEMPT |
| 2243 | /* | 2356 | /* |
| @@ -2255,9 +2368,9 @@ void __kprobes sub_preempt_count(int val) | |||
| 2255 | 2368 | ||
| 2256 | if (preempt_count() == val) | 2369 | if (preempt_count() == val) |
| 2257 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 2370 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
| 2258 | preempt_count() -= val; | 2371 | __preempt_count_sub(val); |
| 2259 | } | 2372 | } |
| 2260 | EXPORT_SYMBOL(sub_preempt_count); | 2373 | EXPORT_SYMBOL(preempt_count_sub); |
| 2261 | 2374 | ||
| 2262 | #endif | 2375 | #endif |
| 2263 | 2376 | ||
| @@ -2430,6 +2543,7 @@ need_resched: | |||
| 2430 | put_prev_task(rq, prev); | 2543 | put_prev_task(rq, prev); |
| 2431 | next = pick_next_task(rq); | 2544 | next = pick_next_task(rq); |
| 2432 | clear_tsk_need_resched(prev); | 2545 | clear_tsk_need_resched(prev); |
| 2546 | clear_preempt_need_resched(); | ||
| 2433 | rq->skip_clock_update = 0; | 2547 | rq->skip_clock_update = 0; |
| 2434 | 2548 | ||
| 2435 | if (likely(prev != next)) { | 2549 | if (likely(prev != next)) { |
| @@ -2520,9 +2634,9 @@ asmlinkage void __sched notrace preempt_schedule(void) | |||
| 2520 | return; | 2634 | return; |
| 2521 | 2635 | ||
| 2522 | do { | 2636 | do { |
| 2523 | add_preempt_count_notrace(PREEMPT_ACTIVE); | 2637 | __preempt_count_add(PREEMPT_ACTIVE); |
| 2524 | __schedule(); | 2638 | __schedule(); |
| 2525 | sub_preempt_count_notrace(PREEMPT_ACTIVE); | 2639 | __preempt_count_sub(PREEMPT_ACTIVE); |
| 2526 | 2640 | ||
| 2527 | /* | 2641 | /* |
| 2528 | * Check again in case we missed a preemption opportunity | 2642 | * Check again in case we missed a preemption opportunity |
| @@ -2541,20 +2655,19 @@ EXPORT_SYMBOL(preempt_schedule); | |||
| 2541 | */ | 2655 | */ |
| 2542 | asmlinkage void __sched preempt_schedule_irq(void) | 2656 | asmlinkage void __sched preempt_schedule_irq(void) |
| 2543 | { | 2657 | { |
| 2544 | struct thread_info *ti = current_thread_info(); | ||
| 2545 | enum ctx_state prev_state; | 2658 | enum ctx_state prev_state; |
| 2546 | 2659 | ||
| 2547 | /* Catch callers which need to be fixed */ | 2660 | /* Catch callers which need to be fixed */ |
| 2548 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 2661 | BUG_ON(preempt_count() || !irqs_disabled()); |
| 2549 | 2662 | ||
| 2550 | prev_state = exception_enter(); | 2663 | prev_state = exception_enter(); |
| 2551 | 2664 | ||
| 2552 | do { | 2665 | do { |
| 2553 | add_preempt_count(PREEMPT_ACTIVE); | 2666 | __preempt_count_add(PREEMPT_ACTIVE); |
| 2554 | local_irq_enable(); | 2667 | local_irq_enable(); |
| 2555 | __schedule(); | 2668 | __schedule(); |
| 2556 | local_irq_disable(); | 2669 | local_irq_disable(); |
| 2557 | sub_preempt_count(PREEMPT_ACTIVE); | 2670 | __preempt_count_sub(PREEMPT_ACTIVE); |
| 2558 | 2671 | ||
| 2559 | /* | 2672 | /* |
| 2560 | * Check again in case we missed a preemption opportunity | 2673 | * Check again in case we missed a preemption opportunity |
| @@ -2575,393 +2688,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, | |||
| 2575 | } | 2688 | } |
| 2576 | EXPORT_SYMBOL(default_wake_function); | 2689 | EXPORT_SYMBOL(default_wake_function); |
| 2577 | 2690 | ||
| 2578 | /* | ||
| 2579 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | ||
| 2580 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | ||
| 2581 | * number) then we wake all the non-exclusive tasks and one exclusive task. | ||
| 2582 | * | ||
| 2583 | * There are circumstances in which we can try to wake a task which has already | ||
| 2584 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | ||
| 2585 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | ||
| 2586 | */ | ||
| 2587 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | ||
| 2588 | int nr_exclusive, int wake_flags, void *key) | ||
| 2589 | { | ||
| 2590 | wait_queue_t *curr, *next; | ||
| 2591 | |||
| 2592 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | ||
| 2593 | unsigned flags = curr->flags; | ||
| 2594 | |||
| 2595 | if (curr->func(curr, mode, wake_flags, key) && | ||
| 2596 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | ||
| 2597 | break; | ||
| 2598 | } | ||
| 2599 | } | ||
| 2600 | |||
| 2601 | /** | ||
| 2602 | * __wake_up - wake up threads blocked on a waitqueue. | ||
| 2603 | * @q: the waitqueue | ||
| 2604 | * @mode: which threads | ||
| 2605 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
| 2606 | * @key: is directly passed to the wakeup function | ||
| 2607 | * | ||
| 2608 | * It may be assumed that this function implies a write memory barrier before | ||
| 2609 | * changing the task state if and only if any tasks are woken up. | ||
| 2610 | */ | ||
| 2611 | void __wake_up(wait_queue_head_t *q, unsigned int mode, | ||
| 2612 | int nr_exclusive, void *key) | ||
| 2613 | { | ||
| 2614 | unsigned long flags; | ||
| 2615 | |||
| 2616 | spin_lock_irqsave(&q->lock, flags); | ||
| 2617 | __wake_up_common(q, mode, nr_exclusive, 0, key); | ||
| 2618 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 2619 | } | ||
| 2620 | EXPORT_SYMBOL(__wake_up); | ||
| 2621 | |||
| 2622 | /* | ||
| 2623 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | ||
| 2624 | */ | ||
| 2625 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) | ||
| 2626 | { | ||
| 2627 | __wake_up_common(q, mode, nr, 0, NULL); | ||
| 2628 | } | ||
| 2629 | EXPORT_SYMBOL_GPL(__wake_up_locked); | ||
| 2630 | |||
| 2631 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | ||
| 2632 | { | ||
| 2633 | __wake_up_common(q, mode, 1, 0, key); | ||
| 2634 | } | ||
| 2635 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
| 2636 | |||
| 2637 | /** | ||
| 2638 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | ||
| 2639 | * @q: the waitqueue | ||
| 2640 | * @mode: which threads | ||
| 2641 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
| 2642 | * @key: opaque value to be passed to wakeup targets | ||
| 2643 | * | ||
| 2644 | * The sync wakeup differs that the waker knows that it will schedule | ||
| 2645 | * away soon, so while the target thread will be woken up, it will not | ||
| 2646 | * be migrated to another CPU - ie. the two threads are 'synchronized' | ||
| 2647 | * with each other. This can prevent needless bouncing between CPUs. | ||
| 2648 | * | ||
| 2649 | * On UP it can prevent extra preemption. | ||
| 2650 | * | ||
| 2651 | * It may be assumed that this function implies a write memory barrier before | ||
| 2652 | * changing the task state if and only if any tasks are woken up. | ||
| 2653 | */ | ||
| 2654 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | ||
| 2655 | int nr_exclusive, void *key) | ||
| 2656 | { | ||
| 2657 | unsigned long flags; | ||
| 2658 | int wake_flags = WF_SYNC; | ||
| 2659 | |||
| 2660 | if (unlikely(!q)) | ||
| 2661 | return; | ||
| 2662 | |||
| 2663 | if (unlikely(nr_exclusive != 1)) | ||
| 2664 | wake_flags = 0; | ||
| 2665 | |||
| 2666 | spin_lock_irqsave(&q->lock, flags); | ||
| 2667 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); | ||
| 2668 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 2669 | } | ||
| 2670 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | ||
| 2671 | |||
| 2672 | /* | ||
| 2673 | * __wake_up_sync - see __wake_up_sync_key() | ||
| 2674 | */ | ||
| 2675 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | ||
| 2676 | { | ||
| 2677 | __wake_up_sync_key(q, mode, nr_exclusive, NULL); | ||
| 2678 | } | ||
| 2679 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | ||
| 2680 | |||
| 2681 | /** | ||
| 2682 | * complete: - signals a single thread waiting on this completion | ||
| 2683 | * @x: holds the state of this particular completion | ||
| 2684 | * | ||
| 2685 | * This will wake up a single thread waiting on this completion. Threads will be | ||
| 2686 | * awakened in the same order in which they were queued. | ||
| 2687 | * | ||
| 2688 | * See also complete_all(), wait_for_completion() and related routines. | ||
| 2689 | * | ||
| 2690 | * It may be assumed that this function implies a write memory barrier before | ||
| 2691 | * changing the task state if and only if any tasks are woken up. | ||
| 2692 | */ | ||
| 2693 | void complete(struct completion *x) | ||
| 2694 | { | ||
| 2695 | unsigned long flags; | ||
| 2696 | |||
| 2697 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 2698 | x->done++; | ||
| 2699 | __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); | ||
| 2700 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 2701 | } | ||
| 2702 | EXPORT_SYMBOL(complete); | ||
| 2703 | |||
| 2704 | /** | ||
| 2705 | * complete_all: - signals all threads waiting on this completion | ||
| 2706 | * @x: holds the state of this particular completion | ||
| 2707 | * | ||
| 2708 | * This will wake up all threads waiting on this particular completion event. | ||
| 2709 | * | ||
| 2710 | * It may be assumed that this function implies a write memory barrier before | ||
| 2711 | * changing the task state if and only if any tasks are woken up. | ||
| 2712 | */ | ||
| 2713 | void complete_all(struct completion *x) | ||
| 2714 | { | ||
| 2715 | unsigned long flags; | ||
| 2716 | |||
| 2717 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 2718 | x->done += UINT_MAX/2; | ||
| 2719 | __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); | ||
| 2720 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 2721 | } | ||
| 2722 | EXPORT_SYMBOL(complete_all); | ||
| 2723 | |||
| 2724 | static inline long __sched | ||
| 2725 | do_wait_for_common(struct completion *x, | ||
| 2726 | long (*action)(long), long timeout, int state) | ||
| 2727 | { | ||
| 2728 | if (!x->done) { | ||
| 2729 | DECLARE_WAITQUEUE(wait, current); | ||
| 2730 | |||
| 2731 | __add_wait_queue_tail_exclusive(&x->wait, &wait); | ||
| 2732 | do { | ||
| 2733 | if (signal_pending_state(state, current)) { | ||
| 2734 | timeout = -ERESTARTSYS; | ||
| 2735 | break; | ||
| 2736 | } | ||
| 2737 | __set_current_state(state); | ||
| 2738 | spin_unlock_irq(&x->wait.lock); | ||
| 2739 | timeout = action(timeout); | ||
| 2740 | spin_lock_irq(&x->wait.lock); | ||
| 2741 | } while (!x->done && timeout); | ||
| 2742 | __remove_wait_queue(&x->wait, &wait); | ||
| 2743 | if (!x->done) | ||
| 2744 | return timeout; | ||
| 2745 | } | ||
| 2746 | x->done--; | ||
| 2747 | return timeout ?: 1; | ||
| 2748 | } | ||
| 2749 | |||
| 2750 | static inline long __sched | ||
| 2751 | __wait_for_common(struct completion *x, | ||
| 2752 | long (*action)(long), long timeout, int state) | ||
| 2753 | { | ||
| 2754 | might_sleep(); | ||
| 2755 | |||
| 2756 | spin_lock_irq(&x->wait.lock); | ||
| 2757 | timeout = do_wait_for_common(x, action, timeout, state); | ||
| 2758 | spin_unlock_irq(&x->wait.lock); | ||
| 2759 | return timeout; | ||
| 2760 | } | ||
| 2761 | |||
| 2762 | static long __sched | ||
| 2763 | wait_for_common(struct completion *x, long timeout, int state) | ||
| 2764 | { | ||
| 2765 | return __wait_for_common(x, schedule_timeout, timeout, state); | ||
| 2766 | } | ||
| 2767 | |||
| 2768 | static long __sched | ||
| 2769 | wait_for_common_io(struct completion *x, long timeout, int state) | ||
| 2770 | { | ||
| 2771 | return __wait_for_common(x, io_schedule_timeout, timeout, state); | ||
| 2772 | } | ||
| 2773 | |||
| 2774 | /** | ||
| 2775 | * wait_for_completion: - waits for completion of a task | ||
| 2776 | * @x: holds the state of this particular completion | ||
| 2777 | * | ||
| 2778 | * This waits to be signaled for completion of a specific task. It is NOT | ||
| 2779 | * interruptible and there is no timeout. | ||
| 2780 | * | ||
| 2781 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
| 2782 | * and interrupt capability. Also see complete(). | ||
| 2783 | */ | ||
| 2784 | void __sched wait_for_completion(struct completion *x) | ||
| 2785 | { | ||
| 2786 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
| 2787 | } | ||
| 2788 | EXPORT_SYMBOL(wait_for_completion); | ||
| 2789 | |||
| 2790 | /** | ||
| 2791 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
| 2792 | * @x: holds the state of this particular completion | ||
| 2793 | * @timeout: timeout value in jiffies | ||
| 2794 | * | ||
| 2795 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 2796 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
| 2797 | * interruptible. | ||
| 2798 | * | ||
| 2799 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | ||
| 2800 | * till timeout) if completed. | ||
| 2801 | */ | ||
| 2802 | unsigned long __sched | ||
| 2803 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
| 2804 | { | ||
| 2805 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); | ||
| 2806 | } | ||
| 2807 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
| 2808 | |||
| 2809 | /** | ||
| 2810 | * wait_for_completion_io: - waits for completion of a task | ||
| 2811 | * @x: holds the state of this particular completion | ||
| 2812 | * | ||
| 2813 | * This waits to be signaled for completion of a specific task. It is NOT | ||
| 2814 | * interruptible and there is no timeout. The caller is accounted as waiting | ||
| 2815 | * for IO. | ||
| 2816 | */ | ||
| 2817 | void __sched wait_for_completion_io(struct completion *x) | ||
| 2818 | { | ||
| 2819 | wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
| 2820 | } | ||
| 2821 | EXPORT_SYMBOL(wait_for_completion_io); | ||
| 2822 | |||
| 2823 | /** | ||
| 2824 | * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) | ||
| 2825 | * @x: holds the state of this particular completion | ||
| 2826 | * @timeout: timeout value in jiffies | ||
| 2827 | * | ||
| 2828 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 2829 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
| 2830 | * interruptible. The caller is accounted as waiting for IO. | ||
| 2831 | * | ||
| 2832 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | ||
| 2833 | * till timeout) if completed. | ||
| 2834 | */ | ||
| 2835 | unsigned long __sched | ||
| 2836 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) | ||
| 2837 | { | ||
| 2838 | return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); | ||
| 2839 | } | ||
| 2840 | EXPORT_SYMBOL(wait_for_completion_io_timeout); | ||
| 2841 | |||
| 2842 | /** | ||
| 2843 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
| 2844 | * @x: holds the state of this particular completion | ||
| 2845 | * | ||
| 2846 | * This waits for completion of a specific task to be signaled. It is | ||
| 2847 | * interruptible. | ||
| 2848 | * | ||
| 2849 | * Return: -ERESTARTSYS if interrupted, 0 if completed. | ||
| 2850 | */ | ||
| 2851 | int __sched wait_for_completion_interruptible(struct completion *x) | ||
| 2852 | { | ||
| 2853 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | ||
| 2854 | if (t == -ERESTARTSYS) | ||
| 2855 | return t; | ||
| 2856 | return 0; | ||
| 2857 | } | ||
| 2858 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
| 2859 | |||
| 2860 | /** | ||
| 2861 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
| 2862 | * @x: holds the state of this particular completion | ||
| 2863 | * @timeout: timeout value in jiffies | ||
| 2864 | * | ||
| 2865 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 2866 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
| 2867 | * | ||
| 2868 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, | ||
| 2869 | * or number of jiffies left till timeout) if completed. | ||
| 2870 | */ | ||
| 2871 | long __sched | ||
| 2872 | wait_for_completion_interruptible_timeout(struct completion *x, | ||
| 2873 | unsigned long timeout) | ||
| 2874 | { | ||
| 2875 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); | ||
| 2876 | } | ||
| 2877 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
| 2878 | |||
| 2879 | /** | ||
| 2880 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
| 2881 | * @x: holds the state of this particular completion | ||
| 2882 | * | ||
| 2883 | * This waits to be signaled for completion of a specific task. It can be | ||
| 2884 | * interrupted by a kill signal. | ||
| 2885 | * | ||
| 2886 | * Return: -ERESTARTSYS if interrupted, 0 if completed. | ||
| 2887 | */ | ||
| 2888 | int __sched wait_for_completion_killable(struct completion *x) | ||
| 2889 | { | ||
| 2890 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | ||
| 2891 | if (t == -ERESTARTSYS) | ||
| 2892 | return t; | ||
| 2893 | return 0; | ||
| 2894 | } | ||
| 2895 | EXPORT_SYMBOL(wait_for_completion_killable); | ||
| 2896 | |||
| 2897 | /** | ||
| 2898 | * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) | ||
| 2899 | * @x: holds the state of this particular completion | ||
| 2900 | * @timeout: timeout value in jiffies | ||
| 2901 | * | ||
| 2902 | * This waits for either a completion of a specific task to be | ||
| 2903 | * signaled or for a specified timeout to expire. It can be | ||
| 2904 | * interrupted by a kill signal. The timeout is in jiffies. | ||
| 2905 | * | ||
| 2906 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, | ||
| 2907 | * or number of jiffies left till timeout) if completed. | ||
| 2908 | */ | ||
| 2909 | long __sched | ||
| 2910 | wait_for_completion_killable_timeout(struct completion *x, | ||
| 2911 | unsigned long timeout) | ||
| 2912 | { | ||
| 2913 | return wait_for_common(x, timeout, TASK_KILLABLE); | ||
| 2914 | } | ||
| 2915 | EXPORT_SYMBOL(wait_for_completion_killable_timeout); | ||
| 2916 | |||
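A hedged sketch of how callers typically distinguish the three documented outcomes of the interruptible/killable timeout variants; the helper name and the 500 ms budget are made up, and the usual kernel headers (<linux/completion.h>, <linux/jiffies.h>, <linux/errno.h>) are assumed:

static int example_wait_for_event(struct completion *x)
{
        long ret;

        ret = wait_for_completion_interruptible_timeout(x, msecs_to_jiffies(500));
        if (ret == -ERESTARTSYS)
                return ret;             /* interrupted by a signal */
        if (ret == 0)
                return -ETIMEDOUT;      /* budget elapsed without complete() */
        return 0;                       /* ret > 0: completed with 'ret' jiffies to spare */
}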
| 2917 | /** | ||
| 2918 | * try_wait_for_completion - try to decrement a completion without blocking | ||
| 2919 | * @x: completion structure | ||
| 2920 | * | ||
| 2921 | * Return: 0 if a decrement cannot be done without blocking | ||
| 2922 | * 1 if a decrement succeeded. | ||
| 2923 | * | ||
| 2924 | * If a completion is being used as a counting completion, | ||
| 2925 | * attempt to decrement the counter without blocking. This | ||
| 2926 | * enables us to avoid waiting if the resource the completion | ||
| 2927 | * is protecting is not available. | ||
| 2928 | */ | ||
| 2929 | bool try_wait_for_completion(struct completion *x) | ||
| 2930 | { | ||
| 2931 | unsigned long flags; | ||
| 2932 | int ret = 1; | ||
| 2933 | |||
| 2934 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 2935 | if (!x->done) | ||
| 2936 | ret = 0; | ||
| 2937 | else | ||
| 2938 | x->done--; | ||
| 2939 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 2940 | return ret; | ||
| 2941 | } | ||
| 2942 | EXPORT_SYMBOL(try_wait_for_completion); | ||
| 2943 | |||
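To illustrate the counting-completion use mentioned in the comment above (all names hypothetical): every complete() adds one token, and try_wait_for_completion() consumes one without ever sleeping.

static DECLARE_COMPLETION(free_slots);          /* hypothetical token pool */

static bool example_try_claim_slot(void)
{
        if (try_wait_for_completion(&free_slots))
                return true;            /* consumed one posted complete(), no blocking */
        return false;                   /* count was zero; caller can back off or retry */
}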
| 2944 | /** | ||
| 2945 | * completion_done - Test to see if a completion has any waiters | ||
| 2946 | * @x: completion structure | ||
| 2947 | * | ||
| 2948 | * Return: 0 if there are waiters (wait_for_completion() in progress) | ||
| 2949 | * 1 if there are no waiters. | ||
| 2950 | * | ||
| 2951 | */ | ||
| 2952 | bool completion_done(struct completion *x) | ||
| 2953 | { | ||
| 2954 | unsigned long flags; | ||
| 2955 | int ret = 1; | ||
| 2956 | |||
| 2957 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 2958 | if (!x->done) | ||
| 2959 | ret = 0; | ||
| 2960 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 2961 | return ret; | ||
| 2962 | } | ||
| 2963 | EXPORT_SYMBOL(completion_done); | ||
| 2964 | |||
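Unlike try_wait_for_completion(), completion_done() only inspects the state and never consumes the count. A common (hypothetical) use is guarding against a double complete():

static void example_complete_once(struct completion *x)
{
        if (!completion_done(x))        /* nothing pending; waiters, if any, are still blocked */
                complete(x);
}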
| 2965 | static long __sched | 2691 | static long __sched |
| 2966 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | 2692 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) |
| 2967 | { | 2693 | { |
| @@ -3598,13 +3324,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 3598 | struct task_struct *p; | 3324 | struct task_struct *p; |
| 3599 | int retval; | 3325 | int retval; |
| 3600 | 3326 | ||
| 3601 | get_online_cpus(); | ||
| 3602 | rcu_read_lock(); | 3327 | rcu_read_lock(); |
| 3603 | 3328 | ||
| 3604 | p = find_process_by_pid(pid); | 3329 | p = find_process_by_pid(pid); |
| 3605 | if (!p) { | 3330 | if (!p) { |
| 3606 | rcu_read_unlock(); | 3331 | rcu_read_unlock(); |
| 3607 | put_online_cpus(); | ||
| 3608 | return -ESRCH; | 3332 | return -ESRCH; |
| 3609 | } | 3333 | } |
| 3610 | 3334 | ||
| @@ -3661,7 +3385,6 @@ out_free_cpus_allowed: | |||
| 3661 | free_cpumask_var(cpus_allowed); | 3385 | free_cpumask_var(cpus_allowed); |
| 3662 | out_put_task: | 3386 | out_put_task: |
| 3663 | put_task_struct(p); | 3387 | put_task_struct(p); |
| 3664 | put_online_cpus(); | ||
| 3665 | return retval; | 3388 | return retval; |
| 3666 | } | 3389 | } |
| 3667 | 3390 | ||
| @@ -3706,7 +3429,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
| 3706 | unsigned long flags; | 3429 | unsigned long flags; |
| 3707 | int retval; | 3430 | int retval; |
| 3708 | 3431 | ||
| 3709 | get_online_cpus(); | ||
| 3710 | rcu_read_lock(); | 3432 | rcu_read_lock(); |
| 3711 | 3433 | ||
| 3712 | retval = -ESRCH; | 3434 | retval = -ESRCH; |
| @@ -3719,12 +3441,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
| 3719 | goto out_unlock; | 3441 | goto out_unlock; |
| 3720 | 3442 | ||
| 3721 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 3443 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 3722 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 3444 | cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); |
| 3723 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 3445 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 3724 | 3446 | ||
| 3725 | out_unlock: | 3447 | out_unlock: |
| 3726 | rcu_read_unlock(); | 3448 | rcu_read_unlock(); |
| 3727 | put_online_cpus(); | ||
| 3728 | 3449 | ||
| 3729 | return retval; | 3450 | return retval; |
| 3730 | } | 3451 | } |
| @@ -3794,16 +3515,11 @@ SYSCALL_DEFINE0(sched_yield) | |||
| 3794 | return 0; | 3515 | return 0; |
| 3795 | } | 3516 | } |
| 3796 | 3517 | ||
| 3797 | static inline int should_resched(void) | ||
| 3798 | { | ||
| 3799 | return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); | ||
| 3800 | } | ||
| 3801 | |||
| 3802 | static void __cond_resched(void) | 3518 | static void __cond_resched(void) |
| 3803 | { | 3519 | { |
| 3804 | add_preempt_count(PREEMPT_ACTIVE); | 3520 | __preempt_count_add(PREEMPT_ACTIVE); |
| 3805 | __schedule(); | 3521 | __schedule(); |
| 3806 | sub_preempt_count(PREEMPT_ACTIVE); | 3522 | __preempt_count_sub(PREEMPT_ACTIVE); |
| 3807 | } | 3523 | } |
| 3808 | 3524 | ||
| 3809 | int __sched _cond_resched(void) | 3525 | int __sched _cond_resched(void) |
| @@ -4186,7 +3902,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4186 | 3902 | ||
| 4187 | raw_spin_lock_irqsave(&rq->lock, flags); | 3903 | raw_spin_lock_irqsave(&rq->lock, flags); |
| 4188 | 3904 | ||
| 4189 | __sched_fork(idle); | 3905 | __sched_fork(0, idle); |
| 4190 | idle->state = TASK_RUNNING; | 3906 | idle->state = TASK_RUNNING; |
| 4191 | idle->se.exec_start = sched_clock(); | 3907 | idle->se.exec_start = sched_clock(); |
| 4192 | 3908 | ||
| @@ -4212,7 +3928,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4212 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 3928 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 4213 | 3929 | ||
| 4214 | /* Set the preempt count _outside_ the spinlocks! */ | 3930 | /* Set the preempt count _outside_ the spinlocks! */ |
| 4215 | task_thread_info(idle)->preempt_count = 0; | 3931 | init_idle_preempt_count(idle, cpu); |
| 4216 | 3932 | ||
| 4217 | /* | 3933 | /* |
| 4218 | * The idle tasks have their own, simple scheduling class: | 3934 | * The idle tasks have their own, simple scheduling class: |
| @@ -4346,6 +4062,53 @@ fail: | |||
| 4346 | return ret; | 4062 | return ret; |
| 4347 | } | 4063 | } |
| 4348 | 4064 | ||
| 4065 | #ifdef CONFIG_NUMA_BALANCING | ||
| 4066 | /* Migrate current task p to target_cpu */ | ||
| 4067 | int migrate_task_to(struct task_struct *p, int target_cpu) | ||
| 4068 | { | ||
| 4069 | struct migration_arg arg = { p, target_cpu }; | ||
| 4070 | int curr_cpu = task_cpu(p); | ||
| 4071 | |||
| 4072 | if (curr_cpu == target_cpu) | ||
| 4073 | return 0; | ||
| 4074 | |||
| 4075 | if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) | ||
| 4076 | return -EINVAL; | ||
| 4077 | |||
| 4078 | /* TODO: This is not properly updating schedstats */ | ||
| 4079 | |||
| 4080 | return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); | ||
| 4081 | } | ||
| 4082 | |||
| 4083 | /* | ||
| 4084 | * Requeue a task on a given node and accurately track the number of NUMA | ||
| 4085 | * tasks on the runqueues | ||
| 4086 | */ | ||
| 4087 | void sched_setnuma(struct task_struct *p, int nid) | ||
| 4088 | { | ||
| 4089 | struct rq *rq; | ||
| 4090 | unsigned long flags; | ||
| 4091 | bool on_rq, running; | ||
| 4092 | |||
| 4093 | rq = task_rq_lock(p, &flags); | ||
| 4094 | on_rq = p->on_rq; | ||
| 4095 | running = task_current(rq, p); | ||
| 4096 | |||
| 4097 | if (on_rq) | ||
| 4098 | dequeue_task(rq, p, 0); | ||
| 4099 | if (running) | ||
| 4100 | p->sched_class->put_prev_task(rq, p); | ||
| 4101 | |||
| 4102 | p->numa_preferred_nid = nid; | ||
| 4103 | |||
| 4104 | if (running) | ||
| 4105 | p->sched_class->set_curr_task(rq); | ||
| 4106 | if (on_rq) | ||
| 4107 | enqueue_task(rq, p, 0); | ||
| 4108 | task_rq_unlock(rq, p, &flags); | ||
| 4109 | } | ||
| 4110 | #endif | ||
| 4111 | |||
| 4349 | /* | 4112 | /* |
| 4350 | * migration_cpu_stop - this will be executed by a highprio stopper thread | 4113 | * migration_cpu_stop - this will be executed by a highprio stopper thread |
| 4351 | * and performs thread migration by bumping thread off CPU then | 4114 | * and performs thread migration by bumping thread off CPU then |
| @@ -5119,6 +4882,9 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
| 5119 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 4882 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
| 5120 | DEFINE_PER_CPU(int, sd_llc_size); | 4883 | DEFINE_PER_CPU(int, sd_llc_size); |
| 5121 | DEFINE_PER_CPU(int, sd_llc_id); | 4884 | DEFINE_PER_CPU(int, sd_llc_id); |
| 4885 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
| 4886 | DEFINE_PER_CPU(struct sched_domain *, sd_busy); | ||
| 4887 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | ||
| 5122 | 4888 | ||
| 5123 | static void update_top_cache_domain(int cpu) | 4889 | static void update_top_cache_domain(int cpu) |
| 5124 | { | 4890 | { |
| @@ -5130,11 +4896,18 @@ static void update_top_cache_domain(int cpu) | |||
| 5130 | if (sd) { | 4896 | if (sd) { |
| 5131 | id = cpumask_first(sched_domain_span(sd)); | 4897 | id = cpumask_first(sched_domain_span(sd)); |
| 5132 | size = cpumask_weight(sched_domain_span(sd)); | 4898 | size = cpumask_weight(sched_domain_span(sd)); |
| 4899 | rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); | ||
| 5133 | } | 4900 | } |
| 5134 | 4901 | ||
| 5135 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 4902 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
| 5136 | per_cpu(sd_llc_size, cpu) = size; | 4903 | per_cpu(sd_llc_size, cpu) = size; |
| 5137 | per_cpu(sd_llc_id, cpu) = id; | 4904 | per_cpu(sd_llc_id, cpu) = id; |
| 4905 | |||
| 4906 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
| 4907 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
| 4908 | |||
| 4909 | sd = highest_flag_domain(cpu, SD_ASYM_PACKING); | ||
| 4910 | rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); | ||
| 5138 | } | 4911 | } |
| 5139 | 4912 | ||
| 5140 | /* | 4913 | /* |
| @@ -5654,6 +5427,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 5654 | | 0*SD_SHARE_PKG_RESOURCES | 5427 | | 0*SD_SHARE_PKG_RESOURCES |
| 5655 | | 1*SD_SERIALIZE | 5428 | | 1*SD_SERIALIZE |
| 5656 | | 0*SD_PREFER_SIBLING | 5429 | | 0*SD_PREFER_SIBLING |
| 5430 | | 1*SD_NUMA | ||
| 5657 | | sd_local_flags(level) | 5431 | | sd_local_flags(level) |
| 5658 | , | 5432 | , |
| 5659 | .last_balance = jiffies, | 5433 | .last_balance = jiffies, |
| @@ -6335,14 +6109,17 @@ void __init sched_init_smp(void) | |||
| 6335 | 6109 | ||
| 6336 | sched_init_numa(); | 6110 | sched_init_numa(); |
| 6337 | 6111 | ||
| 6338 | get_online_cpus(); | 6112 | /* |
| 6113 | * There's no userspace yet to cause hotplug operations; hence all the | ||
| 6114 | * cpu masks are stable and all blatant races in the below code cannot | ||
| 6115 | * happen. | ||
| 6116 | */ | ||
| 6339 | mutex_lock(&sched_domains_mutex); | 6117 | mutex_lock(&sched_domains_mutex); |
| 6340 | init_sched_domains(cpu_active_mask); | 6118 | init_sched_domains(cpu_active_mask); |
| 6341 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 6119 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
| 6342 | if (cpumask_empty(non_isolated_cpus)) | 6120 | if (cpumask_empty(non_isolated_cpus)) |
| 6343 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 6121 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
| 6344 | mutex_unlock(&sched_domains_mutex); | 6122 | mutex_unlock(&sched_domains_mutex); |
| 6345 | put_online_cpus(); | ||
| 6346 | 6123 | ||
| 6347 | hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); | 6124 | hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); |
| 6348 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); | 6125 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
| @@ -6505,6 +6282,7 @@ void __init sched_init(void) | |||
| 6505 | rq->online = 0; | 6282 | rq->online = 0; |
| 6506 | rq->idle_stamp = 0; | 6283 | rq->idle_stamp = 0; |
| 6507 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 6284 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
| 6285 | rq->max_idle_balance_cost = sysctl_sched_migration_cost; | ||
| 6508 | 6286 | ||
| 6509 | INIT_LIST_HEAD(&rq->cfs_tasks); | 6287 | INIT_LIST_HEAD(&rq->cfs_tasks); |
| 6510 | 6288 | ||
| @@ -7277,7 +7055,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
| 7277 | 7055 | ||
| 7278 | runtime_enabled = quota != RUNTIME_INF; | 7056 | runtime_enabled = quota != RUNTIME_INF; |
| 7279 | runtime_was_enabled = cfs_b->quota != RUNTIME_INF; | 7057 | runtime_was_enabled = cfs_b->quota != RUNTIME_INF; |
| 7280 | account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); | 7058 | /* |
| 7059 | * If we need to toggle cfs_bandwidth_used, off->on must occur | ||
| 7060 | * before making related changes, and on->off must occur afterwards | ||
| 7061 | */ | ||
| 7062 | if (runtime_enabled && !runtime_was_enabled) | ||
| 7063 | cfs_bandwidth_usage_inc(); | ||
| 7281 | raw_spin_lock_irq(&cfs_b->lock); | 7064 | raw_spin_lock_irq(&cfs_b->lock); |
| 7282 | cfs_b->period = ns_to_ktime(period); | 7065 | cfs_b->period = ns_to_ktime(period); |
| 7283 | cfs_b->quota = quota; | 7066 | cfs_b->quota = quota; |
| @@ -7303,6 +7086,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
| 7303 | unthrottle_cfs_rq(cfs_rq); | 7086 | unthrottle_cfs_rq(cfs_rq); |
| 7304 | raw_spin_unlock_irq(&rq->lock); | 7087 | raw_spin_unlock_irq(&rq->lock); |
| 7305 | } | 7088 | } |
| 7089 | if (runtime_was_enabled && !runtime_enabled) | ||
| 7090 | cfs_bandwidth_usage_dec(); | ||
| 7306 | out_unlock: | 7091 | out_unlock: |
| 7307 | mutex_unlock(&cfs_constraints_mutex); | 7092 | mutex_unlock(&cfs_constraints_mutex); |
| 7308 | 7093 | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 196559994f7c..5c34d1817e8f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/seq_file.h> | 15 | #include <linux/seq_file.h> |
| 16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
| 17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
| 18 | #include <linux/mempolicy.h> | ||
| 18 | 19 | ||
| 19 | #include "sched.h" | 20 | #include "sched.h" |
| 20 | 21 | ||
| @@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 137 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", | 138 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
| 138 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
| 139 | #endif | 140 | #endif |
| 141 | #ifdef CONFIG_NUMA_BALANCING | ||
| 142 | SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); | ||
| 143 | #endif | ||
| 140 | #ifdef CONFIG_CGROUP_SCHED | 144 | #ifdef CONFIG_CGROUP_SCHED |
| 141 | SEQ_printf(m, " %s", task_group_path(task_group(p))); | 145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
| 142 | #endif | 146 | #endif |
| @@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
| 159 | read_lock_irqsave(&tasklist_lock, flags); | 163 | read_lock_irqsave(&tasklist_lock, flags); |
| 160 | 164 | ||
| 161 | do_each_thread(g, p) { | 165 | do_each_thread(g, p) { |
| 162 | if (!p->on_rq || task_cpu(p) != rq_cpu) | 166 | if (task_cpu(p) != rq_cpu) |
| 163 | continue; | 167 | continue; |
| 164 | 168 | ||
| 165 | print_task(m, rq, p); | 169 | print_task(m, rq, p); |
| @@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 225 | atomic_read(&cfs_rq->tg->runnable_avg)); | 229 | atomic_read(&cfs_rq->tg->runnable_avg)); |
| 226 | #endif | 230 | #endif |
| 227 | #endif | 231 | #endif |
| 232 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 233 | SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", | ||
| 234 | cfs_rq->tg->cfs_bandwidth.timer_active); | ||
| 235 | SEQ_printf(m, " .%-30s: %d\n", "throttled", | ||
| 236 | cfs_rq->throttled); | ||
| 237 | SEQ_printf(m, " .%-30s: %d\n", "throttle_count", | ||
| 238 | cfs_rq->throttle_count); | ||
| 239 | #endif | ||
| 228 | 240 | ||
| 229 | #ifdef CONFIG_FAIR_GROUP_SCHED | 241 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 230 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 242 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
| @@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m) | |||
| 345 | cpu_clk = local_clock(); | 357 | cpu_clk = local_clock(); |
| 346 | local_irq_restore(flags); | 358 | local_irq_restore(flags); |
| 347 | 359 | ||
| 348 | SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", | 360 | SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n", |
| 349 | init_utsname()->release, | 361 | init_utsname()->release, |
| 350 | (int)strcspn(init_utsname()->version, " "), | 362 | (int)strcspn(init_utsname()->version, " "), |
| 351 | init_utsname()->version); | 363 | init_utsname()->version); |
| @@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void) | |||
| 488 | 500 | ||
| 489 | __initcall(init_sched_debug_procfs); | 501 | __initcall(init_sched_debug_procfs); |
| 490 | 502 | ||
| 503 | #define __P(F) \ | ||
| 504 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | ||
| 505 | #define P(F) \ | ||
| 506 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | ||
| 507 | #define __PN(F) \ | ||
| 508 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
| 509 | #define PN(F) \ | ||
| 510 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
| 511 | |||
| 512 | |||
| 513 | static void sched_show_numa(struct task_struct *p, struct seq_file *m) | ||
| 514 | { | ||
| 515 | #ifdef CONFIG_NUMA_BALANCING | ||
| 516 | struct mempolicy *pol; | ||
| 517 | int node, i; | ||
| 518 | |||
| 519 | if (p->mm) | ||
| 520 | P(mm->numa_scan_seq); | ||
| 521 | |||
| 522 | task_lock(p); | ||
| 523 | pol = p->mempolicy; | ||
| 524 | if (pol && !(pol->flags & MPOL_F_MORON)) | ||
| 525 | pol = NULL; | ||
| 526 | mpol_get(pol); | ||
| 527 | task_unlock(p); | ||
| 528 | |||
| 529 | SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); | ||
| 530 | |||
| 531 | for_each_online_node(node) { | ||
| 532 | for (i = 0; i < 2; i++) { | ||
| 533 | unsigned long nr_faults = -1; | ||
| 534 | int cpu_current, home_node; | ||
| 535 | |||
| 536 | if (p->numa_faults) | ||
| 537 | nr_faults = p->numa_faults[2*node + i]; | ||
| 538 | |||
| 539 | cpu_current = !i ? (task_node(p) == node) : | ||
| 540 | (pol && node_isset(node, pol->v.nodes)); | ||
| 541 | |||
| 542 | home_node = (p->numa_preferred_nid == node); | ||
| 543 | |||
| 544 | SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", | ||
| 545 | i, node, cpu_current, home_node, nr_faults); | ||
| 546 | } | ||
| 547 | } | ||
| 548 | |||
| 549 | mpol_put(pol); | ||
| 550 | #endif | ||
| 551 | } | ||
| 552 | |||
| 491 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | 553 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) |
| 492 | { | 554 | { |
| 493 | unsigned long nr_switches; | 555 | unsigned long nr_switches; |
| @@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 591 | SEQ_printf(m, "%-45s:%21Ld\n", | 653 | SEQ_printf(m, "%-45s:%21Ld\n", |
| 592 | "clock-delta", (long long)(t1-t0)); | 654 | "clock-delta", (long long)(t1-t0)); |
| 593 | } | 655 | } |
| 656 | |||
| 657 | sched_show_numa(p, m); | ||
| 594 | } | 658 | } |
| 595 | 659 | ||
| 596 | void proc_sched_set_task(struct task_struct *p) | 660 | void proc_sched_set_task(struct task_struct *p) |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c70201fbc61..df77c605c7a6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 681 | } | 681 | } |
| 682 | 682 | ||
| 683 | #ifdef CONFIG_SMP | 683 | #ifdef CONFIG_SMP |
| 684 | static unsigned long task_h_load(struct task_struct *p); | ||
| 685 | |||
| 684 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 686 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
| 685 | 687 | ||
| 686 | /* Give new task start runnable values to heavy its load in infant time */ | 688 | /* Give new task start runnable values to heavy its load in infant time */ |
| @@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 818 | 820 | ||
| 819 | #ifdef CONFIG_NUMA_BALANCING | 821 | #ifdef CONFIG_NUMA_BALANCING |
| 820 | /* | 822 | /* |
| 821 | * numa task sample period in ms | 823 | * Approximate time to scan a full NUMA task in ms. The task scan period is |
| 824 | * calculated based on the task's virtual memory size and | ||
| 825 | * numa_balancing_scan_size. | ||
| 822 | */ | 826 | */ |
| 823 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | 827 | unsigned int sysctl_numa_balancing_scan_period_min = 1000; |
| 824 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | 828 | unsigned int sysctl_numa_balancing_scan_period_max = 60000; |
| 825 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
| 826 | 829 | ||
| 827 | /* Portion of address space to scan in MB */ | 830 | /* Portion of address space to scan in MB */ |
| 828 | unsigned int sysctl_numa_balancing_scan_size = 256; | 831 | unsigned int sysctl_numa_balancing_scan_size = 256; |
| @@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256; | |||
| 830 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | 833 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ |
| 831 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | 834 | unsigned int sysctl_numa_balancing_scan_delay = 1000; |
| 832 | 835 | ||
| 833 | static void task_numa_placement(struct task_struct *p) | 836 | /* |
| 837 | * After skipping a page migration on a shared page, skip N more numa page | ||
| 838 | * migrations unconditionally. This reduces the number of NUMA migrations | ||
| 839 | * in shared memory workloads, and has the effect of pulling tasks towards | ||
| 840 | * where their memory lives, over pulling the memory towards the task. | ||
| 841 | */ | ||
| 842 | unsigned int sysctl_numa_balancing_migrate_deferred = 16; | ||
| 843 | |||
| 844 | static unsigned int task_nr_scan_windows(struct task_struct *p) | ||
| 845 | { | ||
| 846 | unsigned long rss = 0; | ||
| 847 | unsigned long nr_scan_pages; | ||
| 848 | |||
| 849 | /* | ||
| 850 | * Calculations are based on RSS, as non-present and empty pages are skipped | ||
| 851 | * by the PTE scanner and NUMA hinting faults should be trapped based | ||
| 852 | * on resident pages | ||
| 853 | */ | ||
| 854 | nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); | ||
| 855 | rss = get_mm_rss(p->mm); | ||
| 856 | if (!rss) | ||
| 857 | rss = nr_scan_pages; | ||
| 858 | |||
| 859 | rss = round_up(rss, nr_scan_pages); | ||
| 860 | return rss / nr_scan_pages; | ||
| 861 | } | ||
| 862 | |||
| 863 | /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ | ||
| 864 | #define MAX_SCAN_WINDOW 2560 | ||
| 865 | |||
| 866 | static unsigned int task_scan_min(struct task_struct *p) | ||
| 867 | { | ||
| 868 | unsigned int scan, floor; | ||
| 869 | unsigned int windows = 1; | ||
| 870 | |||
| 871 | if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) | ||
| 872 | windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; | ||
| 873 | floor = 1000 / windows; | ||
| 874 | |||
| 875 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); | ||
| 876 | return max_t(unsigned int, floor, scan); | ||
| 877 | } | ||
| 878 | |||
| 879 | static unsigned int task_scan_max(struct task_struct *p) | ||
| 880 | { | ||
| 881 | unsigned int smin = task_scan_min(p); | ||
| 882 | unsigned int smax; | ||
| 883 | |||
| 884 | /* Watch for min being lower than max due to floor calculations */ | ||
| 885 | smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); | ||
| 886 | return max(smin, smax); | ||
| 887 | } | ||
| 888 | |||
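To make the arithmetic above concrete (a rough worked example using the defaults introduced in this patch, 4 KB pages, and a hypothetical task): 1 GB of RSS spans 1024/256 = 4 scan windows, so task_scan_min() returns max(1000/10, 1000/4) = 250 ms and task_scan_max() returns 60000/4 = 15000 ms. For a very large task the 100 ms floor (1000 ms spread over MAX_SCAN_WINDOW/scan_size = 10 windows) takes over instead, which is what caps scanning at roughly 2560 MB per second.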
| 889 | /* | ||
| 890 | * Once a preferred node is selected the scheduler balancer will prefer moving | ||
| 891 | * a task to that node for sysctl_numa_balancing_settle_count number of PTE | ||
| 892 | * scans. This will give the process the chance to accumulate more faults on | ||
| 893 | * the preferred node but still allow the scheduler to move the task again if | ||
| 894 | * the nodes CPUs are overloaded. | ||
| 895 | */ | ||
| 896 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; | ||
| 897 | |||
| 898 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
| 899 | { | ||
| 900 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | ||
| 901 | rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); | ||
| 902 | } | ||
| 903 | |||
| 904 | static void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
| 905 | { | ||
| 906 | rq->nr_numa_running -= (p->numa_preferred_nid != -1); | ||
| 907 | rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); | ||
| 908 | } | ||
| 909 | |||
| 910 | struct numa_group { | ||
| 911 | atomic_t refcount; | ||
| 912 | |||
| 913 | spinlock_t lock; /* nr_tasks, tasks */ | ||
| 914 | int nr_tasks; | ||
| 915 | pid_t gid; | ||
| 916 | struct list_head task_list; | ||
| 917 | |||
| 918 | struct rcu_head rcu; | ||
| 919 | unsigned long total_faults; | ||
| 920 | unsigned long faults[0]; | ||
| 921 | }; | ||
| 922 | |||
| 923 | pid_t task_numa_group_id(struct task_struct *p) | ||
| 924 | { | ||
| 925 | return p->numa_group ? p->numa_group->gid : 0; | ||
| 926 | } | ||
| 927 | |||
| 928 | static inline int task_faults_idx(int nid, int priv) | ||
| 929 | { | ||
| 930 | return 2 * nid + priv; | ||
| 931 | } | ||
| 932 | |||
| 933 | static inline unsigned long task_faults(struct task_struct *p, int nid) | ||
| 934 | { | ||
| 935 | if (!p->numa_faults) | ||
| 936 | return 0; | ||
| 937 | |||
| 938 | return p->numa_faults[task_faults_idx(nid, 0)] + | ||
| 939 | p->numa_faults[task_faults_idx(nid, 1)]; | ||
| 940 | } | ||
| 941 | |||
| 942 | static inline unsigned long group_faults(struct task_struct *p, int nid) | ||
| 943 | { | ||
| 944 | if (!p->numa_group) | ||
| 945 | return 0; | ||
| 946 | |||
| 947 | return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; | ||
| 948 | } | ||
| 949 | |||
| 950 | /* | ||
| 951 | * These return the fraction of accesses done by a particular task, or | ||
| 952 | * task group, on a particular numa node. The group weight is given a | ||
| 953 | * larger multiplier, in order to group tasks together that are almost | ||
| 954 | * evenly spread out between numa nodes. | ||
| 955 | */ | ||
| 956 | static inline unsigned long task_weight(struct task_struct *p, int nid) | ||
| 957 | { | ||
| 958 | unsigned long total_faults; | ||
| 959 | |||
| 960 | if (!p->numa_faults) | ||
| 961 | return 0; | ||
| 962 | |||
| 963 | total_faults = p->total_numa_faults; | ||
| 964 | |||
| 965 | if (!total_faults) | ||
| 966 | return 0; | ||
| 967 | |||
| 968 | return 1000 * task_faults(p, nid) / total_faults; | ||
| 969 | } | ||
| 970 | |||
| 971 | static inline unsigned long group_weight(struct task_struct *p, int nid) | ||
| 834 | { | 972 | { |
| 835 | int seq; | 973 | if (!p->numa_group || !p->numa_group->total_faults) |
| 974 | return 0; | ||
| 836 | 975 | ||
| 837 | if (!p->mm) /* for example, ksmd faulting in a user's mm */ | 976 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; |
| 977 | } | ||
| 978 | |||
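As a worked example of these per-mille fractions (hypothetical numbers): a task with 600 of its 1000 decayed faults recorded on node 1 has task_weight(p, 1) = 600, and if its numa_group accumulated 7000 of 10000 faults there, group_weight(p, 1) = 700; because both are scaled to the same 0..1000 range they can be compared directly across nodes when picking a destination.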
| 979 | static unsigned long weighted_cpuload(const int cpu); | ||
| 980 | static unsigned long source_load(int cpu, int type); | ||
| 981 | static unsigned long target_load(int cpu, int type); | ||
| 982 | static unsigned long power_of(int cpu); | ||
| 983 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg); | ||
| 984 | |||
| 985 | /* Cached statistics for all CPUs within a node */ | ||
| 986 | struct numa_stats { | ||
| 987 | unsigned long nr_running; | ||
| 988 | unsigned long load; | ||
| 989 | |||
| 990 | /* Total compute capacity of CPUs on a node */ | ||
| 991 | unsigned long power; | ||
| 992 | |||
| 993 | /* Approximate capacity in terms of runnable tasks on a node */ | ||
| 994 | unsigned long capacity; | ||
| 995 | int has_capacity; | ||
| 996 | }; | ||
| 997 | |||
| 998 | /* | ||
| 999 | * XXX borrowed from update_sg_lb_stats | ||
| 1000 | */ | ||
| 1001 | static void update_numa_stats(struct numa_stats *ns, int nid) | ||
| 1002 | { | ||
| 1003 | int cpu; | ||
| 1004 | |||
| 1005 | memset(ns, 0, sizeof(*ns)); | ||
| 1006 | for_each_cpu(cpu, cpumask_of_node(nid)) { | ||
| 1007 | struct rq *rq = cpu_rq(cpu); | ||
| 1008 | |||
| 1009 | ns->nr_running += rq->nr_running; | ||
| 1010 | ns->load += weighted_cpuload(cpu); | ||
| 1011 | ns->power += power_of(cpu); | ||
| 1012 | } | ||
| 1013 | |||
| 1014 | ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; | ||
| 1015 | ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); | ||
| 1016 | ns->has_capacity = (ns->nr_running < ns->capacity); | ||
| 1017 | } | ||
| 1018 | |||
| 1019 | struct task_numa_env { | ||
| 1020 | struct task_struct *p; | ||
| 1021 | |||
| 1022 | int src_cpu, src_nid; | ||
| 1023 | int dst_cpu, dst_nid; | ||
| 1024 | |||
| 1025 | struct numa_stats src_stats, dst_stats; | ||
| 1026 | |||
| 1027 | int imbalance_pct, idx; | ||
| 1028 | |||
| 1029 | struct task_struct *best_task; | ||
| 1030 | long best_imp; | ||
| 1031 | int best_cpu; | ||
| 1032 | }; | ||
| 1033 | |||
| 1034 | static void task_numa_assign(struct task_numa_env *env, | ||
| 1035 | struct task_struct *p, long imp) | ||
| 1036 | { | ||
| 1037 | if (env->best_task) | ||
| 1038 | put_task_struct(env->best_task); | ||
| 1039 | if (p) | ||
| 1040 | get_task_struct(p); | ||
| 1041 | |||
| 1042 | env->best_task = p; | ||
| 1043 | env->best_imp = imp; | ||
| 1044 | env->best_cpu = env->dst_cpu; | ||
| 1045 | } | ||
| 1046 | |||
| 1047 | /* | ||
| 1048 | * This checks if the overall compute and NUMA accesses of the system would | ||
| 1049 | * be improved if the source task was migrated to the target dst_cpu, taking | ||
| 1050 | * into account that it might be better to exchange the task running on the | ||
| 1051 | * dst_cpu with the source task. | ||
| 1052 | */ | ||
| 1053 | static void task_numa_compare(struct task_numa_env *env, | ||
| 1054 | long taskimp, long groupimp) | ||
| 1055 | { | ||
| 1056 | struct rq *src_rq = cpu_rq(env->src_cpu); | ||
| 1057 | struct rq *dst_rq = cpu_rq(env->dst_cpu); | ||
| 1058 | struct task_struct *cur; | ||
| 1059 | long dst_load, src_load; | ||
| 1060 | long load; | ||
| 1061 | long imp = (groupimp > 0) ? groupimp : taskimp; | ||
| 1062 | |||
| 1063 | rcu_read_lock(); | ||
| 1064 | cur = ACCESS_ONCE(dst_rq->curr); | ||
| 1065 | if (cur->pid == 0) /* idle */ | ||
| 1066 | cur = NULL; | ||
| 1067 | |||
| 1068 | /* | ||
| 1069 | * "imp" is the fault differential for the source task between the | ||
| 1070 | * source and destination node. Calculate the total differential for | ||
| 1071 | * the source task and potential destination task. The more negative | ||
| 1072 | * the value is, the more remote accesses that would be expected to | ||
| 1073 | * be incurred if the tasks were swapped. | ||
| 1074 | */ | ||
| 1075 | if (cur) { | ||
| 1076 | /* Skip this swap candidate if cannot move to the source cpu */ | ||
| 1077 | if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) | ||
| 1078 | goto unlock; | ||
| 1079 | |||
| 1080 | /* | ||
| 1081 | * If dst and source tasks are in the same NUMA group, or not | ||
| 1082 | * in any group then look only at task weights. | ||
| 1083 | */ | ||
| 1084 | if (cur->numa_group == env->p->numa_group) { | ||
| 1085 | imp = taskimp + task_weight(cur, env->src_nid) - | ||
| 1086 | task_weight(cur, env->dst_nid); | ||
| 1087 | /* | ||
| 1088 | * Add some hysteresis to prevent swapping the | ||
| 1089 | * tasks within a group over tiny differences. | ||
| 1090 | */ | ||
| 1091 | if (cur->numa_group) | ||
| 1092 | imp -= imp/16; | ||
| 1093 | } else { | ||
| 1094 | /* | ||
| 1095 | * Compare the group weights. If a task is all by | ||
| 1096 | * itself (not part of a group), use the task weight | ||
| 1097 | * instead. | ||
| 1098 | */ | ||
| 1099 | if (env->p->numa_group) | ||
| 1100 | imp = groupimp; | ||
| 1101 | else | ||
| 1102 | imp = taskimp; | ||
| 1103 | |||
| 1104 | if (cur->numa_group) | ||
| 1105 | imp += group_weight(cur, env->src_nid) - | ||
| 1106 | group_weight(cur, env->dst_nid); | ||
| 1107 | else | ||
| 1108 | imp += task_weight(cur, env->src_nid) - | ||
| 1109 | task_weight(cur, env->dst_nid); | ||
| 1110 | } | ||
| 1111 | } | ||
| 1112 | |||
| 1113 | if (imp < env->best_imp) | ||
| 1114 | goto unlock; | ||
| 1115 | |||
| 1116 | if (!cur) { | ||
| 1117 | /* Is there capacity at our destination? */ | ||
| 1118 | if (env->src_stats.has_capacity && | ||
| 1119 | !env->dst_stats.has_capacity) | ||
| 1120 | goto unlock; | ||
| 1121 | |||
| 1122 | goto balance; | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | /* Balance doesn't matter much if we're running a task per cpu */ | ||
| 1126 | if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) | ||
| 1127 | goto assign; | ||
| 1128 | |||
| 1129 | /* | ||
| 1130 | * In the overloaded case, try and keep the load balanced. | ||
| 1131 | */ | ||
| 1132 | balance: | ||
| 1133 | dst_load = env->dst_stats.load; | ||
| 1134 | src_load = env->src_stats.load; | ||
| 1135 | |||
| 1136 | /* XXX missing power terms */ | ||
| 1137 | load = task_h_load(env->p); | ||
| 1138 | dst_load += load; | ||
| 1139 | src_load -= load; | ||
| 1140 | |||
| 1141 | if (cur) { | ||
| 1142 | load = task_h_load(cur); | ||
| 1143 | dst_load -= load; | ||
| 1144 | src_load += load; | ||
| 1145 | } | ||
| 1146 | |||
| 1147 | /* make src_load the smaller */ | ||
| 1148 | if (dst_load < src_load) | ||
| 1149 | swap(dst_load, src_load); | ||
| 1150 | |||
| 1151 | if (src_load * env->imbalance_pct < dst_load * 100) | ||
| 1152 | goto unlock; | ||
| 1153 | |||
| 1154 | assign: | ||
| 1155 | task_numa_assign(env, cur, imp); | ||
| 1156 | unlock: | ||
| 1157 | rcu_read_unlock(); | ||
| 1158 | } | ||
| 1159 | |||
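To illustrate the final balance check above (hypothetical load figures): with imbalance_pct = 112, after the projected move (and possible swap) the candidate is kept only if the larger of the two node loads stays within 12% of the smaller, e.g. src_load = 1000 versus dst_load = 1100 passes (1000 * 112 >= 1100 * 100) while dst_load = 1200 is rejected and the RCU section is left without calling task_numa_assign().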
| 1160 | static void task_numa_find_cpu(struct task_numa_env *env, | ||
| 1161 | long taskimp, long groupimp) | ||
| 1162 | { | ||
| 1163 | int cpu; | ||
| 1164 | |||
| 1165 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { | ||
| 1166 | /* Skip this CPU if the source task cannot migrate */ | ||
| 1167 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) | ||
| 1168 | continue; | ||
| 1169 | |||
| 1170 | env->dst_cpu = cpu; | ||
| 1171 | task_numa_compare(env, taskimp, groupimp); | ||
| 1172 | } | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | static int task_numa_migrate(struct task_struct *p) | ||
| 1176 | { | ||
| 1177 | struct task_numa_env env = { | ||
| 1178 | .p = p, | ||
| 1179 | |||
| 1180 | .src_cpu = task_cpu(p), | ||
| 1181 | .src_nid = task_node(p), | ||
| 1182 | |||
| 1183 | .imbalance_pct = 112, | ||
| 1184 | |||
| 1185 | .best_task = NULL, | ||
| 1186 | .best_imp = 0, | ||
| 1187 | .best_cpu = -1 | ||
| 1188 | }; | ||
| 1189 | struct sched_domain *sd; | ||
| 1190 | unsigned long taskweight, groupweight; | ||
| 1191 | int nid, ret; | ||
| 1192 | long taskimp, groupimp; | ||
| 1193 | |||
| 1194 | /* | ||
| 1195 | * Pick the lowest SD_NUMA domain, as that would have the smallest | ||
| 1196 | * imbalance and would be the first to start moving tasks about. | ||
| 1197 | * | ||
| 1198 | * And we want to avoid any moving of tasks about, as that would create | ||
| 1199 | * random movement of tasks, counter to the numa conditions we're trying | ||
| 1200 | * to satisfy here. | ||
| 1201 | */ | ||
| 1202 | rcu_read_lock(); | ||
| 1203 | sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); | ||
| 1204 | env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; | ||
| 1205 | rcu_read_unlock(); | ||
| 1206 | |||
| 1207 | taskweight = task_weight(p, env.src_nid); | ||
| 1208 | groupweight = group_weight(p, env.src_nid); | ||
| 1209 | update_numa_stats(&env.src_stats, env.src_nid); | ||
| 1210 | env.dst_nid = p->numa_preferred_nid; | ||
| 1211 | taskimp = task_weight(p, env.dst_nid) - taskweight; | ||
| 1212 | groupimp = group_weight(p, env.dst_nid) - groupweight; | ||
| 1213 | update_numa_stats(&env.dst_stats, env.dst_nid); | ||
| 1214 | |||
| 1215 | /* If the preferred nid has capacity, try to use it. */ | ||
| 1216 | if (env.dst_stats.has_capacity) | ||
| 1217 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
| 1218 | |||
| 1219 | /* No space available on the preferred nid. Look elsewhere. */ | ||
| 1220 | if (env.best_cpu == -1) { | ||
| 1221 | for_each_online_node(nid) { | ||
| 1222 | if (nid == env.src_nid || nid == p->numa_preferred_nid) | ||
| 1223 | continue; | ||
| 1224 | |||
| 1225 | /* Only consider nodes where both the task and its group benefit */ | ||
| 1226 | taskimp = task_weight(p, nid) - taskweight; | ||
| 1227 | groupimp = group_weight(p, nid) - groupweight; | ||
| 1228 | if (taskimp < 0 && groupimp < 0) | ||
| 1229 | continue; | ||
| 1230 | |||
| 1231 | env.dst_nid = nid; | ||
| 1232 | update_numa_stats(&env.dst_stats, env.dst_nid); | ||
| 1233 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
| 1234 | } | ||
| 1235 | } | ||
| 1236 | |||
| 1237 | /* No better CPU than the current one was found. */ | ||
| 1238 | if (env.best_cpu == -1) | ||
| 1239 | return -EAGAIN; | ||
| 1240 | |||
| 1241 | sched_setnuma(p, env.dst_nid); | ||
| 1242 | |||
| 1243 | /* | ||
| 1244 | * Reset the scan period if the task is being rescheduled on an | ||
| 1245 | * alternative node to recheck if the task is now properly placed. | ||
| 1246 | */ | ||
| 1247 | p->numa_scan_period = task_scan_min(p); | ||
| 1248 | |||
| 1249 | if (env.best_task == NULL) { | ||
| 1250 | int ret = migrate_task_to(p, env.best_cpu); | ||
| 1251 | return ret; | ||
| 1252 | } | ||
| 1253 | |||
| 1254 | ret = migrate_swap(p, env.best_task); | ||
| 1255 | put_task_struct(env.best_task); | ||
| 1256 | return ret; | ||
| 1257 | } | ||
| 1258 | |||
| 1259 | /* Attempt to migrate a task to a CPU on the preferred node. */ | ||
| 1260 | static void numa_migrate_preferred(struct task_struct *p) | ||
| 1261 | { | ||
| 1262 | /* This task has no NUMA fault statistics yet */ | ||
| 1263 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | ||
| 1264 | return; | ||
| 1265 | |||
| 1266 | /* Periodically retry migrating the task to the preferred node */ | ||
| 1267 | p->numa_migrate_retry = jiffies + HZ; | ||
| 1268 | |||
| 1269 | /* Success if task is already running on preferred CPU */ | ||
| 1270 | if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) | ||
| 838 | return; | 1271 | return; |
| 1272 | |||
| 1273 | /* Otherwise, try migrate to a CPU on the preferred node */ | ||
| 1274 | task_numa_migrate(p); | ||
| 1275 | } | ||
| 1276 | |||
| 1277 | /* | ||
| 1278 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS | ||
| 1279 | * increments. The more local the fault statistics are, the higher the scan | ||
| 1280 | * period will be for the next scan window. If local/remote ratio is below | ||
| 1281 | * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the | ||
| 1282 | * scan period will decrease | ||
| 1283 | */ | ||
| 1284 | #define NUMA_PERIOD_SLOTS 10 | ||
| 1285 | #define NUMA_PERIOD_THRESHOLD 3 | ||
| 1286 | |||
| 1287 | /* | ||
| 1288 | * Increase the scan period (slow down scanning) if the majority of | ||
| 1289 | * our memory is already on our local node, or if the majority of | ||
| 1290 | * the page accesses are shared with other processes. | ||
| 1291 | * Otherwise, decrease the scan period. | ||
| 1292 | */ | ||
| 1293 | static void update_task_scan_period(struct task_struct *p, | ||
| 1294 | unsigned long shared, unsigned long private) | ||
| 1295 | { | ||
| 1296 | unsigned int period_slot; | ||
| 1297 | int ratio; | ||
| 1298 | int diff; | ||
| 1299 | |||
| 1300 | unsigned long remote = p->numa_faults_locality[0]; | ||
| 1301 | unsigned long local = p->numa_faults_locality[1]; | ||
| 1302 | |||
| 1303 | /* | ||
| 1304 | * If there were no record hinting faults then either the task is | ||
| 1305 | * completely idle or all activity is in areas that are not of interest | ||
| 1306 | * to automatic numa balancing. Scan slower. | ||
| 1307 | */ | ||
| 1308 | if (local + shared == 0) { | ||
| 1309 | p->numa_scan_period = min(p->numa_scan_period_max, | ||
| 1310 | p->numa_scan_period << 1); | ||
| 1311 | |||
| 1312 | p->mm->numa_next_scan = jiffies + | ||
| 1313 | msecs_to_jiffies(p->numa_scan_period); | ||
| 1314 | |||
| 1315 | return; | ||
| 1316 | } | ||
| 1317 | |||
| 1318 | /* | ||
| 1319 | * Prepare to scale scan period relative to the current period. | ||
| 1320 | * == NUMA_PERIOD_THRESHOLD scan period stays the same | ||
| 1321 | * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) | ||
| 1322 | * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) | ||
| 1323 | */ | ||
| 1324 | period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); | ||
| 1325 | ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); | ||
| 1326 | if (ratio >= NUMA_PERIOD_THRESHOLD) { | ||
| 1327 | int slot = ratio - NUMA_PERIOD_THRESHOLD; | ||
| 1328 | if (!slot) | ||
| 1329 | slot = 1; | ||
| 1330 | diff = slot * period_slot; | ||
| 1331 | } else { | ||
| 1332 | diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; | ||
| 1333 | |||
| 1334 | /* | ||
| 1335 | * Scale scan rate increases based on sharing. There is an | ||
| 1336 | * inverse relationship between the degree of sharing and | ||
| 1337 | * the adjustment made to the scanning period. Broadly | ||
| 1338 | * speaking the intent is that there is little point | ||
| 1339 | * scanning faster if shared accesses dominate as it may | ||
| 1340 | * simply bounce migrations uselessly | ||
| 1341 | */ | ||
| 1342 | period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); | ||
| 1343 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | ||
| 1344 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | ||
| 1345 | } | ||
| 1346 | |||
| 1347 | p->numa_scan_period = clamp(p->numa_scan_period + diff, | ||
| 1348 | task_scan_min(p), task_scan_max(p)); | ||
| 1349 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | ||
| 1350 | } | ||
| 1351 | |||
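A worked example of this adjustment (hypothetical fault counts): with a current period of 1000 ms, period_slot is 100 ms. 80% local faults gives ratio 8, so the period grows by (8 - 3) * 100 = 500 ms and scanning slows down; 10% local gives a raw decrease of 200 ms, which is then scaled by the private share of faults (for instance halved to 100 ms when private and shared faults are equal) before the result is clamped to [task_scan_min(), task_scan_max()].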
| 1352 | static void task_numa_placement(struct task_struct *p) | ||
| 1353 | { | ||
| 1354 | int seq, nid, max_nid = -1, max_group_nid = -1; | ||
| 1355 | unsigned long max_faults = 0, max_group_faults = 0; | ||
| 1356 | unsigned long fault_types[2] = { 0, 0 }; | ||
| 1357 | spinlock_t *group_lock = NULL; | ||
| 1358 | |||
| 839 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); | 1359 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); |
| 840 | if (p->numa_scan_seq == seq) | 1360 | if (p->numa_scan_seq == seq) |
| 841 | return; | 1361 | return; |
| 842 | p->numa_scan_seq = seq; | 1362 | p->numa_scan_seq = seq; |
| 1363 | p->numa_scan_period_max = task_scan_max(p); | ||
| 1364 | |||
| 1365 | /* If the task is part of a group prevent parallel updates to group stats */ | ||
| 1366 | if (p->numa_group) { | ||
| 1367 | group_lock = &p->numa_group->lock; | ||
| 1368 | spin_lock(group_lock); | ||
| 1369 | } | ||
| 1370 | |||
| 1371 | /* Find the node with the highest number of faults */ | ||
| 1372 | for_each_online_node(nid) { | ||
| 1373 | unsigned long faults = 0, group_faults = 0; | ||
| 1374 | int priv, i; | ||
| 1375 | |||
| 1376 | for (priv = 0; priv < 2; priv++) { | ||
| 1377 | long diff; | ||
| 1378 | |||
| 1379 | i = task_faults_idx(nid, priv); | ||
| 1380 | diff = -p->numa_faults[i]; | ||
| 1381 | |||
| 1382 | /* Decay existing window, copy faults since last scan */ | ||
| 1383 | p->numa_faults[i] >>= 1; | ||
| 1384 | p->numa_faults[i] += p->numa_faults_buffer[i]; | ||
| 1385 | fault_types[priv] += p->numa_faults_buffer[i]; | ||
| 1386 | p->numa_faults_buffer[i] = 0; | ||
| 1387 | |||
| 1388 | faults += p->numa_faults[i]; | ||
| 1389 | diff += p->numa_faults[i]; | ||
| 1390 | p->total_numa_faults += diff; | ||
| 1391 | if (p->numa_group) { | ||
| 1392 | /* safe because we can only change our own group */ | ||
| 1393 | p->numa_group->faults[i] += diff; | ||
| 1394 | p->numa_group->total_faults += diff; | ||
| 1395 | group_faults += p->numa_group->faults[i]; | ||
| 1396 | } | ||
| 1397 | } | ||
| 1398 | |||
| 1399 | if (faults > max_faults) { | ||
| 1400 | max_faults = faults; | ||
| 1401 | max_nid = nid; | ||
| 1402 | } | ||
| 1403 | |||
| 1404 | if (group_faults > max_group_faults) { | ||
| 1405 | max_group_faults = group_faults; | ||
| 1406 | max_group_nid = nid; | ||
| 1407 | } | ||
| 1408 | } | ||
| 1409 | |||
| 1410 | update_task_scan_period(p, fault_types[0], fault_types[1]); | ||
| 1411 | |||
| 1412 | if (p->numa_group) { | ||
| 1413 | /* | ||
| 1414 | * If the preferred task and group nids are different, | ||
| 1415 | * iterate over the nodes again to find the best place. | ||
| 1416 | */ | ||
| 1417 | if (max_nid != max_group_nid) { | ||
| 1418 | unsigned long weight, max_weight = 0; | ||
| 1419 | |||
| 1420 | for_each_online_node(nid) { | ||
| 1421 | weight = task_weight(p, nid) + group_weight(p, nid); | ||
| 1422 | if (weight > max_weight) { | ||
| 1423 | max_weight = weight; | ||
| 1424 | max_nid = nid; | ||
| 1425 | } | ||
| 1426 | } | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | spin_unlock(group_lock); | ||
| 1430 | } | ||
| 1431 | |||
| 1432 | /* Preferred node as the node with the most faults */ | ||
| 1433 | if (max_faults && max_nid != p->numa_preferred_nid) { | ||
| 1434 | /* Update the preferred nid and migrate task if possible */ | ||
| 1435 | sched_setnuma(p, max_nid); | ||
| 1436 | numa_migrate_preferred(p); | ||
| 1437 | } | ||
| 1438 | } | ||
| 1439 | |||
| 1440 | static inline int get_numa_group(struct numa_group *grp) | ||
| 1441 | { | ||
| 1442 | return atomic_inc_not_zero(&grp->refcount); | ||
| 1443 | } | ||
| 1444 | |||
| 1445 | static inline void put_numa_group(struct numa_group *grp) | ||
| 1446 | { | ||
| 1447 | if (atomic_dec_and_test(&grp->refcount)) | ||
| 1448 | kfree_rcu(grp, rcu); | ||
| 1449 | } | ||
| 1450 | |||
| 1451 | static void task_numa_group(struct task_struct *p, int cpupid, int flags, | ||
| 1452 | int *priv) | ||
| 1453 | { | ||
| 1454 | struct numa_group *grp, *my_grp; | ||
| 1455 | struct task_struct *tsk; | ||
| 1456 | bool join = false; | ||
| 1457 | int cpu = cpupid_to_cpu(cpupid); | ||
| 1458 | int i; | ||
| 1459 | |||
| 1460 | if (unlikely(!p->numa_group)) { | ||
| 1461 | unsigned int size = sizeof(struct numa_group) + | ||
| 1462 | 2*nr_node_ids*sizeof(unsigned long); | ||
| 1463 | |||
| 1464 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); | ||
| 1465 | if (!grp) | ||
| 1466 | return; | ||
| 1467 | |||
| 1468 | atomic_set(&grp->refcount, 1); | ||
| 1469 | spin_lock_init(&grp->lock); | ||
| 1470 | INIT_LIST_HEAD(&grp->task_list); | ||
| 1471 | grp->gid = p->pid; | ||
| 1472 | |||
| 1473 | for (i = 0; i < 2*nr_node_ids; i++) | ||
| 1474 | grp->faults[i] = p->numa_faults[i]; | ||
| 1475 | |||
| 1476 | grp->total_faults = p->total_numa_faults; | ||
| 1477 | |||
| 1478 | list_add(&p->numa_entry, &grp->task_list); | ||
| 1479 | grp->nr_tasks++; | ||
| 1480 | rcu_assign_pointer(p->numa_group, grp); | ||
| 1481 | } | ||
| 1482 | |||
| 1483 | rcu_read_lock(); | ||
| 1484 | tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); | ||
| 1485 | |||
| 1486 | if (!cpupid_match_pid(tsk, cpupid)) | ||
| 1487 | goto no_join; | ||
| 1488 | |||
| 1489 | grp = rcu_dereference(tsk->numa_group); | ||
| 1490 | if (!grp) | ||
| 1491 | goto no_join; | ||
| 1492 | |||
| 1493 | my_grp = p->numa_group; | ||
| 1494 | if (grp == my_grp) | ||
| 1495 | goto no_join; | ||
| 1496 | |||
| 1497 | /* | ||
| 1498 | * Only join the other group if it's bigger; if we're the bigger group, | ||
| 1499 | * the other task will join us. | ||
| 1500 | */ | ||
| 1501 | if (my_grp->nr_tasks > grp->nr_tasks) | ||
| 1502 | goto no_join; | ||
| 1503 | |||
| 1504 | /* | ||
| 1505 | * Tie-break on the grp address. | ||
| 1506 | */ | ||
| 1507 | if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) | ||
| 1508 | goto no_join; | ||
| 1509 | |||
| 1510 | /* Always join threads in the same process. */ | ||
| 1511 | if (tsk->mm == current->mm) | ||
| 1512 | join = true; | ||
| 1513 | |||
| 1514 | /* Simple filter to avoid false positives due to PID collisions */ | ||
| 1515 | if (flags & TNF_SHARED) | ||
| 1516 | join = true; | ||
| 1517 | |||
| 1518 | /* Update priv based on whether false sharing was detected */ | ||
| 1519 | *priv = !join; | ||
| 1520 | |||
| 1521 | if (join && !get_numa_group(grp)) | ||
| 1522 | goto no_join; | ||
| 843 | 1523 | ||
| 844 | /* FIXME: Scheduling placement policy hints go here */ | 1524 | rcu_read_unlock(); |
| 1525 | |||
| 1526 | if (!join) | ||
| 1527 | return; | ||
| 1528 | |||
| 1529 | double_lock(&my_grp->lock, &grp->lock); | ||
| 1530 | |||
| 1531 | for (i = 0; i < 2*nr_node_ids; i++) { | ||
| 1532 | my_grp->faults[i] -= p->numa_faults[i]; | ||
| 1533 | grp->faults[i] += p->numa_faults[i]; | ||
| 1534 | } | ||
| 1535 | my_grp->total_faults -= p->total_numa_faults; | ||
| 1536 | grp->total_faults += p->total_numa_faults; | ||
| 1537 | |||
| 1538 | list_move(&p->numa_entry, &grp->task_list); | ||
| 1539 | my_grp->nr_tasks--; | ||
| 1540 | grp->nr_tasks++; | ||
| 1541 | |||
| 1542 | spin_unlock(&my_grp->lock); | ||
| 1543 | spin_unlock(&grp->lock); | ||
| 1544 | |||
| 1545 | rcu_assign_pointer(p->numa_group, grp); | ||
| 1546 | |||
| 1547 | put_numa_group(my_grp); | ||
| 1548 | return; | ||
| 1549 | |||
| 1550 | no_join: | ||
| 1551 | rcu_read_unlock(); | ||
| 1552 | return; | ||
| 1553 | } | ||
| 1554 | |||
| 1555 | void task_numa_free(struct task_struct *p) | ||
| 1556 | { | ||
| 1557 | struct numa_group *grp = p->numa_group; | ||
| 1558 | int i; | ||
| 1559 | void *numa_faults = p->numa_faults; | ||
| 1560 | |||
| 1561 | if (grp) { | ||
| 1562 | spin_lock(&grp->lock); | ||
| 1563 | for (i = 0; i < 2*nr_node_ids; i++) | ||
| 1564 | grp->faults[i] -= p->numa_faults[i]; | ||
| 1565 | grp->total_faults -= p->total_numa_faults; | ||
| 1566 | |||
| 1567 | list_del(&p->numa_entry); | ||
| 1568 | grp->nr_tasks--; | ||
| 1569 | spin_unlock(&grp->lock); | ||
| 1570 | rcu_assign_pointer(p->numa_group, NULL); | ||
| 1571 | put_numa_group(grp); | ||
| 1572 | } | ||
| 1573 | |||
| 1574 | p->numa_faults = NULL; | ||
| 1575 | p->numa_faults_buffer = NULL; | ||
| 1576 | kfree(numa_faults); | ||
| 845 | } | 1577 | } |
| 846 | 1578 | ||
| 847 | /* | 1579 | /* |
| 848 | * Got a PROT_NONE fault for a page on @node. | 1580 | * Got a PROT_NONE fault for a page on @node. |
| 849 | */ | 1581 | */ |
| 850 | void task_numa_fault(int node, int pages, bool migrated) | 1582 | void task_numa_fault(int last_cpupid, int node, int pages, int flags) |
| 851 | { | 1583 | { |
| 852 | struct task_struct *p = current; | 1584 | struct task_struct *p = current; |
| 1585 | bool migrated = flags & TNF_MIGRATED; | ||
| 1586 | int priv; | ||
| 853 | 1587 | ||
| 854 | if (!numabalancing_enabled) | 1588 | if (!numabalancing_enabled) |
| 855 | return; | 1589 | return; |
| 856 | 1590 | ||
| 857 | /* FIXME: Allocate task-specific structure for placement policy here */ | 1591 | /* for example, ksmd faulting in a user's mm */ |
| 1592 | if (!p->mm) | ||
| 1593 | return; | ||
| 1594 | |||
| 1595 | /* Do not worry about placement if exiting */ | ||
| 1596 | if (p->state == TASK_DEAD) | ||
| 1597 | return; | ||
| 1598 | |||
| 1599 | /* Allocate buffer to track faults on a per-node basis */ | ||
| 1600 | if (unlikely(!p->numa_faults)) { | ||
| 1601 | int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; | ||
| 1602 | |||
| 1603 | /* numa_faults and numa_faults_buffer share the allocation */ | ||
| 1604 | p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); | ||
| 1605 | if (!p->numa_faults) | ||
| 1606 | return; | ||
| 1607 | |||
| 1608 | BUG_ON(p->numa_faults_buffer); | ||
| 1609 | p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); | ||
| 1610 | p->total_numa_faults = 0; | ||
| 1611 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | ||
| 1612 | } | ||
| 858 | 1613 | ||
| 859 | /* | 1614 | /* |
| 860 | * If pages are properly placed (did not migrate) then scan slower. | 1615 | * First accesses are treated as private, otherwise consider accesses |
| 861 | * This is reset periodically in case of phase changes | 1616 | * to be private if the accessing pid has not changed |
| 862 | */ | 1617 | */ |
| 863 | if (!migrated) | 1618 | if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { |
| 864 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | 1619 | priv = 1; |
| 865 | p->numa_scan_period + jiffies_to_msecs(10)); | 1620 | } else { |
| 1621 | priv = cpupid_match_pid(p, last_cpupid); | ||
| 1622 | if (!priv && !(flags & TNF_NO_GROUP)) | ||
| 1623 | task_numa_group(p, last_cpupid, flags, &priv); | ||
| 1624 | } | ||
| 866 | 1625 | ||
| 867 | task_numa_placement(p); | 1626 | task_numa_placement(p); |
| 1627 | |||
| 1628 | /* | ||
| 1629 | * Retry task to preferred node migration periodically, in case it | ||
| 1630 | * previously failed, or the scheduler moved us. | ||
| 1631 | */ | ||
| 1632 | if (time_after(jiffies, p->numa_migrate_retry)) | ||
| 1633 | numa_migrate_preferred(p); | ||
| 1634 | |||
| 1635 | if (migrated) | ||
| 1636 | p->numa_pages_migrated += pages; | ||
| 1637 | |||
| 1638 | p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; | ||
| 1639 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; | ||
| 868 | } | 1640 | } |
| 869 | 1641 | ||
| 870 | static void reset_ptenuma_scan(struct task_struct *p) | 1642 | static void reset_ptenuma_scan(struct task_struct *p) |
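The reworked task_numa_fault() above keys everything off a single allocation holding two halves of 2 * nr_node_ids counters each (the long-term stats plus the per-scan buffer), with a shared and a private slot per node. Below is a minimal user-space sketch of that layout; task_faults_idx() is not part of this hunk, so the 2*node + priv indexing used here is an assumption for illustration only.

/*
 * User-space model of the per-task fault array layout: one allocation
 * holds two halves (decayed stats + per-scan buffer), each with
 * 2 * nr_node_ids counters split into shared/private per node.
 * The 2*nid + priv indexing is assumed, not taken from this hunk.
 */
#include <stdio.h>
#include <stdlib.h>

static int nr_node_ids = 4;            /* pretend 4-node machine */

static int task_faults_idx(int nid, int priv)
{
	return 2 * nid + priv;         /* assumed layout for the demo */
}

int main(void)
{
	int entries = 2 * nr_node_ids;
	/* numa_faults and numa_faults_buffer share the allocation */
	unsigned long *numa_faults = calloc(2 * entries, sizeof(*numa_faults));
	unsigned long *numa_faults_buffer = numa_faults + entries;

	/* record a private fault of 8 pages on node 2, as the tail of
	 * task_numa_fault() does into the buffer half */
	numa_faults_buffer[task_faults_idx(2, 1)] += 8;

	for (int nid = 0; nid < nr_node_ids; nid++)
		printf("node %d: shared=%lu private=%lu\n", nid,
		       numa_faults_buffer[task_faults_idx(nid, 0)],
		       numa_faults_buffer[task_faults_idx(nid, 1)]);

	free(numa_faults);
	return 0;
}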
| @@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work) | |||
| 884 | struct mm_struct *mm = p->mm; | 1656 | struct mm_struct *mm = p->mm; |
| 885 | struct vm_area_struct *vma; | 1657 | struct vm_area_struct *vma; |
| 886 | unsigned long start, end; | 1658 | unsigned long start, end; |
| 1659 | unsigned long nr_pte_updates = 0; | ||
| 887 | long pages; | 1660 | long pages; |
| 888 | 1661 | ||
| 889 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 1662 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); |
| @@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work) | |||
| 900 | if (p->flags & PF_EXITING) | 1673 | if (p->flags & PF_EXITING) |
| 901 | return; | 1674 | return; |
| 902 | 1675 | ||
| 903 | /* | 1676 | if (!mm->numa_next_scan) { |
| 904 | * We do not care about task placement until a task runs on a node | 1677 | mm->numa_next_scan = now + |
| 905 | * other than the first one used by the address space. This is | 1678 | msecs_to_jiffies(sysctl_numa_balancing_scan_delay); |
| 906 | * largely because migrations are driven by what CPU the task | ||
| 907 | * is running on. If it's never scheduled on another node, it'll | ||
| 908 | * not migrate so why bother trapping the fault. | ||
| 909 | */ | ||
| 910 | if (mm->first_nid == NUMA_PTE_SCAN_INIT) | ||
| 911 | mm->first_nid = numa_node_id(); | ||
| 912 | if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { | ||
| 913 | /* Are we running on a new node yet? */ | ||
| 914 | if (numa_node_id() == mm->first_nid && | ||
| 915 | !sched_feat_numa(NUMA_FORCE)) | ||
| 916 | return; | ||
| 917 | |||
| 918 | mm->first_nid = NUMA_PTE_SCAN_ACTIVE; | ||
| 919 | } | ||
| 920 | |||
| 921 | /* | ||
| 922 | * Reset the scan period if enough time has gone by. Objective is that | ||
| 923 | * scanning will be reduced if pages are properly placed. As tasks | ||
| 924 | * can enter different phases this needs to be re-examined. Lacking | ||
| 925 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
| 926 | */ | ||
| 927 | migrate = mm->numa_next_reset; | ||
| 928 | if (time_after(now, migrate)) { | ||
| 929 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
| 930 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
| 931 | xchg(&mm->numa_next_reset, next_scan); | ||
| 932 | } | 1679 | } |
| 933 | 1680 | ||
| 934 | /* | 1681 | /* |
| @@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work) | |||
| 938 | if (time_before(now, migrate)) | 1685 | if (time_before(now, migrate)) |
| 939 | return; | 1686 | return; |
| 940 | 1687 | ||
| 941 | if (p->numa_scan_period == 0) | 1688 | if (p->numa_scan_period == 0) { |
| 942 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1689 | p->numa_scan_period_max = task_scan_max(p); |
| 1690 | p->numa_scan_period = task_scan_min(p); | ||
| 1691 | } | ||
| 943 | 1692 | ||
| 944 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | 1693 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); |
| 945 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | 1694 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) |
| 946 | return; | 1695 | return; |
| 947 | 1696 | ||
| 948 | /* | 1697 | /* |
| 949 | * Do not set pte_numa if the current running node is rate-limited. | 1698 | * Delay this task enough that another task of this mm will likely win |
| 950 | * This loses statistics on the fault but if we are unwilling to | 1699 | * the next time around. |
| 951 | * migrate to this node, it is less likely we can do useful work | ||
| 952 | */ | 1700 | */ |
| 953 | if (migrate_ratelimited(numa_node_id())) | 1701 | p->node_stamp += 2 * TICK_NSEC; |
| 954 | return; | ||
| 955 | 1702 | ||
| 956 | start = mm->numa_scan_offset; | 1703 | start = mm->numa_scan_offset; |
| 957 | pages = sysctl_numa_balancing_scan_size; | 1704 | pages = sysctl_numa_balancing_scan_size; |
| @@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work) | |||
| 967 | vma = mm->mmap; | 1714 | vma = mm->mmap; |
| 968 | } | 1715 | } |
| 969 | for (; vma; vma = vma->vm_next) { | 1716 | for (; vma; vma = vma->vm_next) { |
| 970 | if (!vma_migratable(vma)) | 1717 | if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) |
| 971 | continue; | 1718 | continue; |
| 972 | 1719 | ||
| 973 | /* Skip small VMAs. They are not likely to be of relevance */ | 1720 | /* |
| 974 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) | 1721 | * Shared library pages mapped by multiple processes are not |
| 1722 | * migrated as it is expected they are cache replicated. Avoid | ||
| 1723 | * hinting faults in read-only file-backed mappings or the vdso | ||
| 1724 | * as migrating the pages will be of marginal benefit. | ||
| 1725 | */ | ||
| 1726 | if (!vma->vm_mm || | ||
| 1727 | (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) | ||
| 975 | continue; | 1728 | continue; |
| 976 | 1729 | ||
| 977 | do { | 1730 | do { |
| 978 | start = max(start, vma->vm_start); | 1731 | start = max(start, vma->vm_start); |
| 979 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | 1732 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); |
| 980 | end = min(end, vma->vm_end); | 1733 | end = min(end, vma->vm_end); |
| 981 | pages -= change_prot_numa(vma, start, end); | 1734 | nr_pte_updates += change_prot_numa(vma, start, end); |
| 1735 | |||
| 1736 | /* | ||
| 1737 | * Scan sysctl_numa_balancing_scan_size but ensure that | ||
| 1738 | * at least one PTE is updated so that unused virtual | ||
| 1739 | * address space is quickly skipped. | ||
| 1740 | */ | ||
| 1741 | if (nr_pte_updates) | ||
| 1742 | pages -= (end - start) >> PAGE_SHIFT; | ||
| 982 | 1743 | ||
| 983 | start = end; | 1744 | start = end; |
| 984 | if (pages <= 0) | 1745 | if (pages <= 0) |
| @@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work) | |||
| 988 | 1749 | ||
| 989 | out: | 1750 | out: |
| 990 | /* | 1751 | /* |
| 991 | * It is possible to reach the end of the VMA list but the last few VMAs are | 1752 | * It is possible to reach the end of the VMA list but the last few |
| 992 | * not guaranteed to be vma_migratable. If they are not, we would find the | 1753 | * VMAs are not guaranteed to be vma_migratable. If they are not, we |
| 993 | * !migratable VMA on the next scan but not reset the scanner to the start | 1754 | * would find the !migratable VMA on the next scan but not reset the |
| 994 | * so check it now. | 1755 | * scanner to the start so check it now. |
| 995 | */ | 1756 | */ |
| 996 | if (vma) | 1757 | if (vma) |
| 997 | mm->numa_scan_offset = start; | 1758 | mm->numa_scan_offset = start; |
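The nr_pte_updates bookkeeping above changes how the scan budget is spent: a range is only charged against sysctl_numa_balancing_scan_size once at least one PTE has actually been updated, so large unused stretches of virtual address space cost almost nothing to skip. A stand-alone sketch of that budgeting, with invented VMA sizes and a fake change_prot_numa(), is below.

/*
 * Sketch of the scan budgeting: the "pages" budget is only charged once
 * a range actually updated at least one PTE. The VMA sizes and the fake
 * change_prot_numa() are made up; the real loop also chunks each VMA on
 * HPAGE_SIZE boundaries, which is omitted here.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define SCAN_SIZE_PAGES	256	/* stand-in for sysctl_numa_balancing_scan_size */

/* pretend only the third region has resident pages to mark */
static long fake_change_prot_numa(int vma, unsigned long start, unsigned long end)
{
	return vma == 2 ? (long)((end - start) >> PAGE_SHIFT) : 0;
}

int main(void)
{
	unsigned long sizes[] = { 1UL << 24, 1UL << 24, 1UL << 24 };	/* three 16MB VMAs */
	long pages = SCAN_SIZE_PAGES;
	unsigned long nr_pte_updates = 0;

	for (int vma = 0; vma < 3 && pages > 0; vma++) {
		unsigned long start = 0, end = sizes[vma];

		nr_pte_updates += fake_change_prot_numa(vma, start, end);
		if (nr_pte_updates)	/* charge the budget only once work happened */
			pages -= (end - start) >> PAGE_SHIFT;

		printf("vma %d: pte updates so far %lu, budget left %ld\n",
		       vma, nr_pte_updates, pages);
	}
	return 0;
}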
| @@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
| 1025 | 1786 | ||
| 1026 | if (now - curr->node_stamp > period) { | 1787 | if (now - curr->node_stamp > period) { |
| 1027 | if (!curr->node_stamp) | 1788 | if (!curr->node_stamp) |
| 1028 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1789 | curr->numa_scan_period = task_scan_min(curr); |
| 1029 | curr->node_stamp = now; | 1790 | curr->node_stamp += period; |
| 1030 | 1791 | ||
| 1031 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | 1792 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { |
| 1032 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | 1793 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ |
| @@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
| 1038 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | 1799 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) |
| 1039 | { | 1800 | { |
| 1040 | } | 1801 | } |
| 1802 | |||
| 1803 | static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
| 1804 | { | ||
| 1805 | } | ||
| 1806 | |||
| 1807 | static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
| 1808 | { | ||
| 1809 | } | ||
| 1041 | #endif /* CONFIG_NUMA_BALANCING */ | 1810 | #endif /* CONFIG_NUMA_BALANCING */ |
| 1042 | 1811 | ||
| 1043 | static void | 1812 | static void |
| @@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 1047 | if (!parent_entity(se)) | 1816 | if (!parent_entity(se)) |
| 1048 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 1817 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
| 1049 | #ifdef CONFIG_SMP | 1818 | #ifdef CONFIG_SMP |
| 1050 | if (entity_is_task(se)) | 1819 | if (entity_is_task(se)) { |
| 1051 | list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); | 1820 | struct rq *rq = rq_of(cfs_rq); |
| 1821 | |||
| 1822 | account_numa_enqueue(rq, task_of(se)); | ||
| 1823 | list_add(&se->group_node, &rq->cfs_tasks); | ||
| 1824 | } | ||
| 1052 | #endif | 1825 | #endif |
| 1053 | cfs_rq->nr_running++; | 1826 | cfs_rq->nr_running++; |
| 1054 | } | 1827 | } |
| @@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 1059 | update_load_sub(&cfs_rq->load, se->load.weight); | 1832 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 1060 | if (!parent_entity(se)) | 1833 | if (!parent_entity(se)) |
| 1061 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | 1834 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
| 1062 | if (entity_is_task(se)) | 1835 | if (entity_is_task(se)) { |
| 1836 | account_numa_dequeue(rq_of(cfs_rq), task_of(se)); | ||
| 1063 | list_del_init(&se->group_node); | 1837 | list_del_init(&se->group_node); |
| 1838 | } | ||
| 1064 | cfs_rq->nr_running--; | 1839 | cfs_rq->nr_running--; |
| 1065 | } | 1840 | } |
| 1066 | 1841 | ||
| @@ -2070,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void) | |||
| 2070 | return static_key_false(&__cfs_bandwidth_used); | 2845 | return static_key_false(&__cfs_bandwidth_used); |
| 2071 | } | 2846 | } |
| 2072 | 2847 | ||
| 2073 | void account_cfs_bandwidth_used(int enabled, int was_enabled) | 2848 | void cfs_bandwidth_usage_inc(void) |
| 2849 | { | ||
| 2850 | static_key_slow_inc(&__cfs_bandwidth_used); | ||
| 2851 | } | ||
| 2852 | |||
| 2853 | void cfs_bandwidth_usage_dec(void) | ||
| 2074 | { | 2854 | { |
| 2075 | /* only need to count groups transitioning between enabled/!enabled */ | 2855 | static_key_slow_dec(&__cfs_bandwidth_used); |
| 2076 | if (enabled && !was_enabled) | ||
| 2077 | static_key_slow_inc(&__cfs_bandwidth_used); | ||
| 2078 | else if (!enabled && was_enabled) | ||
| 2079 | static_key_slow_dec(&__cfs_bandwidth_used); | ||
| 2080 | } | 2856 | } |
| 2081 | #else /* HAVE_JUMP_LABEL */ | 2857 | #else /* HAVE_JUMP_LABEL */ |
| 2082 | static bool cfs_bandwidth_used(void) | 2858 | static bool cfs_bandwidth_used(void) |
| @@ -2084,7 +2860,8 @@ static bool cfs_bandwidth_used(void) | |||
| 2084 | return true; | 2860 | return true; |
| 2085 | } | 2861 | } |
| 2086 | 2862 | ||
| 2087 | void account_cfs_bandwidth_used(int enabled, int was_enabled) {} | 2863 | void cfs_bandwidth_usage_inc(void) {} |
| 2864 | void cfs_bandwidth_usage_dec(void) {} | ||
| 2088 | #endif /* HAVE_JUMP_LABEL */ | 2865 | #endif /* HAVE_JUMP_LABEL */ |
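The inc/dec pair above replaces the old enabled/was_enabled bookkeeping with a plain reference count on the jump label: each group that enables bandwidth control takes a reference, and cfs_bandwidth_used() stays a patched-out branch while the count is zero. A rough user-space model of that refcount behaviour (not the real static_key machinery) is sketched here.

/*
 * Rough model of the jump-label refcount: a boolean stands in for the
 * branch that static_key patching would enable or disable.
 */
#include <stdbool.h>
#include <stdio.h>

static int  usage_count;
static bool branch_enabled;	/* stands in for the patched branch */

static void cfs_bandwidth_usage_inc(void) { if (usage_count++ == 0) branch_enabled = true;  }
static void cfs_bandwidth_usage_dec(void) { if (--usage_count == 0) branch_enabled = false; }
static bool cfs_bandwidth_used(void)      { return branch_enabled; }

int main(void)
{
	cfs_bandwidth_usage_inc();		/* first group turns quota on */
	cfs_bandwidth_usage_inc();		/* second group */
	printf("used=%d\n", cfs_bandwidth_used());
	cfs_bandwidth_usage_dec();
	cfs_bandwidth_usage_dec();		/* last user gone, branch off */
	printf("used=%d\n", cfs_bandwidth_used());
	return 0;
}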
| 2089 | 2866 | ||
| 2090 | /* | 2867 | /* |
| @@ -2335,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 2335 | cfs_rq->throttled_clock = rq_clock(rq); | 3112 | cfs_rq->throttled_clock = rq_clock(rq); |
| 2336 | raw_spin_lock(&cfs_b->lock); | 3113 | raw_spin_lock(&cfs_b->lock); |
| 2337 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 3114 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
| 3115 | if (!cfs_b->timer_active) | ||
| 3116 | __start_cfs_bandwidth(cfs_b); | ||
| 2338 | raw_spin_unlock(&cfs_b->lock); | 3117 | raw_spin_unlock(&cfs_b->lock); |
| 2339 | } | 3118 | } |
| 2340 | 3119 | ||
| @@ -2448,6 +3227,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
| 2448 | if (idle) | 3227 | if (idle) |
| 2449 | goto out_unlock; | 3228 | goto out_unlock; |
| 2450 | 3229 | ||
| 3230 | /* | ||
| 3231 | * if we have relooped after returning idle once, we need to update our | ||
| 3232 | * status as actually running, so that other cpus doing | ||
| 3233 | * __start_cfs_bandwidth will stop trying to cancel us. | ||
| 3234 | */ | ||
| 3235 | cfs_b->timer_active = 1; | ||
| 3236 | |||
| 2451 | __refill_cfs_bandwidth_runtime(cfs_b); | 3237 | __refill_cfs_bandwidth_runtime(cfs_b); |
| 2452 | 3238 | ||
| 2453 | if (!throttled) { | 3239 | if (!throttled) { |
| @@ -2508,7 +3294,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; | |||
| 2508 | /* how long we wait to gather additional slack before distributing */ | 3294 | /* how long we wait to gather additional slack before distributing */ |
| 2509 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; | 3295 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; |
| 2510 | 3296 | ||
| 2511 | /* are we near the end of the current quota period? */ | 3297 | /* |
| 3298 | * Are we near the end of the current quota period? | ||
| 3299 | * | ||
| 3300 | * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the | ||
| 3301 | * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of | ||
| 3302 | * migrate_hrtimers, base is never cleared, so we are fine. | ||
| 3303 | */ | ||
| 2512 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) | 3304 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) |
| 2513 | { | 3305 | { |
| 2514 | struct hrtimer *refresh_timer = &cfs_b->period_timer; | 3306 | struct hrtimer *refresh_timer = &cfs_b->period_timer; |
| @@ -2584,10 +3376,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
| 2584 | u64 expires; | 3376 | u64 expires; |
| 2585 | 3377 | ||
| 2586 | /* confirm we're still not at a refresh boundary */ | 3378 | /* confirm we're still not at a refresh boundary */ |
| 2587 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) | 3379 | raw_spin_lock(&cfs_b->lock); |
| 3380 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { | ||
| 3381 | raw_spin_unlock(&cfs_b->lock); | ||
| 2588 | return; | 3382 | return; |
| 3383 | } | ||
| 2589 | 3384 | ||
| 2590 | raw_spin_lock(&cfs_b->lock); | ||
| 2591 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | 3385 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { |
| 2592 | runtime = cfs_b->runtime; | 3386 | runtime = cfs_b->runtime; |
| 2593 | cfs_b->runtime = 0; | 3387 | cfs_b->runtime = 0; |
| @@ -2708,11 +3502,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
| 2708 | * (timer_active==0 becomes visible before the hrtimer call-back | 3502 | * (timer_active==0 becomes visible before the hrtimer call-back |
| 2709 | * terminates). In either case we ensure that it's re-programmed | 3503 | * terminates). In either case we ensure that it's re-programmed |
| 2710 | */ | 3504 | */ |
| 2711 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | 3505 | while (unlikely(hrtimer_active(&cfs_b->period_timer)) && |
| 3506 | hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { | ||
| 3507 | /* bounce the lock to allow do_sched_cfs_period_timer to run */ | ||
| 2712 | raw_spin_unlock(&cfs_b->lock); | 3508 | raw_spin_unlock(&cfs_b->lock); |
| 2713 | /* ensure cfs_b->lock is available while we wait */ | 3509 | cpu_relax(); |
| 2714 | hrtimer_cancel(&cfs_b->period_timer); | ||
| 2715 | |||
| 2716 | raw_spin_lock(&cfs_b->lock); | 3510 | raw_spin_lock(&cfs_b->lock); |
| 2717 | /* if someone else restarted the timer then we're done */ | 3511 | /* if someone else restarted the timer then we're done */ |
| 2718 | if (cfs_b->timer_active) | 3512 | if (cfs_b->timer_active) |
| @@ -3113,7 +3907,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 3113 | { | 3907 | { |
| 3114 | struct sched_entity *se = tg->se[cpu]; | 3908 | struct sched_entity *se = tg->se[cpu]; |
| 3115 | 3909 | ||
| 3116 | if (!tg->parent) /* the trivial, non-cgroup case */ | 3910 | if (!tg->parent || !wl) /* the trivial, non-cgroup case */ |
| 3117 | return wl; | 3911 | return wl; |
| 3118 | 3912 | ||
| 3119 | for_each_sched_entity(se) { | 3913 | for_each_sched_entity(se) { |
| @@ -3166,8 +3960,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 3166 | } | 3960 | } |
| 3167 | #else | 3961 | #else |
| 3168 | 3962 | ||
| 3169 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 3963 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
| 3170 | unsigned long wl, unsigned long wg) | ||
| 3171 | { | 3964 | { |
| 3172 | return wl; | 3965 | return wl; |
| 3173 | } | 3966 | } |
| @@ -3420,11 +4213,10 @@ done: | |||
| 3420 | * preempt must be disabled. | 4213 | * preempt must be disabled. |
| 3421 | */ | 4214 | */ |
| 3422 | static int | 4215 | static int |
| 3423 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | 4216 | select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) |
| 3424 | { | 4217 | { |
| 3425 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 4218 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
| 3426 | int cpu = smp_processor_id(); | 4219 | int cpu = smp_processor_id(); |
| 3427 | int prev_cpu = task_cpu(p); | ||
| 3428 | int new_cpu = cpu; | 4220 | int new_cpu = cpu; |
| 3429 | int want_affine = 0; | 4221 | int want_affine = 0; |
| 3430 | int sync = wake_flags & WF_SYNC; | 4222 | int sync = wake_flags & WF_SYNC; |
| @@ -3904,9 +4696,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 3904 | 4696 | ||
| 3905 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 4697 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
| 3906 | 4698 | ||
| 4699 | enum fbq_type { regular, remote, all }; | ||
| 4700 | |||
| 3907 | #define LBF_ALL_PINNED 0x01 | 4701 | #define LBF_ALL_PINNED 0x01 |
| 3908 | #define LBF_NEED_BREAK 0x02 | 4702 | #define LBF_NEED_BREAK 0x02 |
| 3909 | #define LBF_SOME_PINNED 0x04 | 4703 | #define LBF_DST_PINNED 0x04 |
| 4704 | #define LBF_SOME_PINNED 0x08 | ||
| 3910 | 4705 | ||
| 3911 | struct lb_env { | 4706 | struct lb_env { |
| 3912 | struct sched_domain *sd; | 4707 | struct sched_domain *sd; |
| @@ -3929,6 +4724,8 @@ struct lb_env { | |||
| 3929 | unsigned int loop; | 4724 | unsigned int loop; |
| 3930 | unsigned int loop_break; | 4725 | unsigned int loop_break; |
| 3931 | unsigned int loop_max; | 4726 | unsigned int loop_max; |
| 4727 | |||
| 4728 | enum fbq_type fbq_type; | ||
| 3932 | }; | 4729 | }; |
| 3933 | 4730 | ||
| 3934 | /* | 4731 | /* |
| @@ -3975,6 +4772,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 3975 | return delta < (s64)sysctl_sched_migration_cost; | 4772 | return delta < (s64)sysctl_sched_migration_cost; |
| 3976 | } | 4773 | } |
| 3977 | 4774 | ||
| 4775 | #ifdef CONFIG_NUMA_BALANCING | ||
| 4776 | /* Returns true if the destination node has incurred more faults */ | ||
| 4777 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | ||
| 4778 | { | ||
| 4779 | int src_nid, dst_nid; | ||
| 4780 | |||
| 4781 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || | ||
| 4782 | !(env->sd->flags & SD_NUMA)) { | ||
| 4783 | return false; | ||
| 4784 | } | ||
| 4785 | |||
| 4786 | src_nid = cpu_to_node(env->src_cpu); | ||
| 4787 | dst_nid = cpu_to_node(env->dst_cpu); | ||
| 4788 | |||
| 4789 | if (src_nid == dst_nid) | ||
| 4790 | return false; | ||
| 4791 | |||
| 4792 | /* Always encourage migration to the preferred node. */ | ||
| 4793 | if (dst_nid == p->numa_preferred_nid) | ||
| 4794 | return true; | ||
| 4795 | |||
| 4796 | /* If both task and group weight improve, this move is a winner. */ | ||
| 4797 | if (task_weight(p, dst_nid) > task_weight(p, src_nid) && | ||
| 4798 | group_weight(p, dst_nid) > group_weight(p, src_nid)) | ||
| 4799 | return true; | ||
| 4800 | |||
| 4801 | return false; | ||
| 4802 | } | ||
| 4803 | |||
| 4804 | |||
| 4805 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | ||
| 4806 | { | ||
| 4807 | int src_nid, dst_nid; | ||
| 4808 | |||
| 4809 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | ||
| 4810 | return false; | ||
| 4811 | |||
| 4812 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | ||
| 4813 | return false; | ||
| 4814 | |||
| 4815 | src_nid = cpu_to_node(env->src_cpu); | ||
| 4816 | dst_nid = cpu_to_node(env->dst_cpu); | ||
| 4817 | |||
| 4818 | if (src_nid == dst_nid) | ||
| 4819 | return false; | ||
| 4820 | |||
| 4821 | /* Migrating away from the preferred node is always bad. */ | ||
| 4822 | if (src_nid == p->numa_preferred_nid) | ||
| 4823 | return true; | ||
| 4824 | |||
| 4825 | /* If either task or group weight get worse, don't do it. */ | ||
| 4826 | if (task_weight(p, dst_nid) < task_weight(p, src_nid) || | ||
| 4827 | group_weight(p, dst_nid) < group_weight(p, src_nid)) | ||
| 4828 | return true; | ||
| 4829 | |||
| 4830 | return false; | ||
| 4831 | } | ||
| 4832 | |||
| 4833 | #else | ||
| 4834 | static inline bool migrate_improves_locality(struct task_struct *p, | ||
| 4835 | struct lb_env *env) | ||
| 4836 | { | ||
| 4837 | return false; | ||
| 4838 | } | ||
| 4839 | |||
| 4840 | static inline bool migrate_degrades_locality(struct task_struct *p, | ||
| 4841 | struct lb_env *env) | ||
| 4842 | { | ||
| 4843 | return false; | ||
| 4844 | } | ||
| 4845 | #endif | ||
| 4846 | |||
| 3978 | /* | 4847 | /* |
| 3979 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 4848 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
| 3980 | */ | 4849 | */ |
| @@ -3997,6 +4866,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 3997 | 4866 | ||
| 3998 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 4867 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
| 3999 | 4868 | ||
| 4869 | env->flags |= LBF_SOME_PINNED; | ||
| 4870 | |||
| 4000 | /* | 4871 | /* |
| 4001 | * Remember if this task can be migrated to any other cpu in | 4872 | * Remember if this task can be migrated to any other cpu in |
| 4002 | * our sched_group. We may want to revisit it if we couldn't | 4873 | * our sched_group. We may want to revisit it if we couldn't |
| @@ -4005,13 +4876,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 4005 | * Also avoid computing new_dst_cpu if we have already computed | 4876 | * Also avoid computing new_dst_cpu if we have already computed |
| 4006 | * one in current iteration. | 4877 | * one in current iteration. |
| 4007 | */ | 4878 | */ |
| 4008 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | 4879 | if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) |
| 4009 | return 0; | 4880 | return 0; |
| 4010 | 4881 | ||
| 4011 | /* Prevent to re-select dst_cpu via env's cpus */ | 4882 | /* Prevent to re-select dst_cpu via env's cpus */ |
| 4012 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 4883 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
| 4013 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { | 4884 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { |
| 4014 | env->flags |= LBF_SOME_PINNED; | 4885 | env->flags |= LBF_DST_PINNED; |
| 4015 | env->new_dst_cpu = cpu; | 4886 | env->new_dst_cpu = cpu; |
| 4016 | break; | 4887 | break; |
| 4017 | } | 4888 | } |
| @@ -4030,11 +4901,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 4030 | 4901 | ||
| 4031 | /* | 4902 | /* |
| 4032 | * Aggressive migration if: | 4903 | * Aggressive migration if: |
| 4033 | * 1) task is cache cold, or | 4904 | * 1) destination numa is preferred |
| 4034 | * 2) too many balance attempts have failed. | 4905 | * 2) task is cache cold, or |
| 4906 | * 3) too many balance attempts have failed. | ||
| 4035 | */ | 4907 | */ |
| 4036 | |||
| 4037 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); | 4908 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); |
| 4909 | if (!tsk_cache_hot) | ||
| 4910 | tsk_cache_hot = migrate_degrades_locality(p, env); | ||
| 4911 | |||
| 4912 | if (migrate_improves_locality(p, env)) { | ||
| 4913 | #ifdef CONFIG_SCHEDSTATS | ||
| 4914 | if (tsk_cache_hot) { | ||
| 4915 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | ||
| 4916 | schedstat_inc(p, se.statistics.nr_forced_migrations); | ||
| 4917 | } | ||
| 4918 | #endif | ||
| 4919 | return 1; | ||
| 4920 | } | ||
| 4921 | |||
| 4038 | if (!tsk_cache_hot || | 4922 | if (!tsk_cache_hot || |
| 4039 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 4923 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
| 4040 | 4924 | ||
| @@ -4077,8 +4961,6 @@ static int move_one_task(struct lb_env *env) | |||
| 4077 | return 0; | 4961 | return 0; |
| 4078 | } | 4962 | } |
| 4079 | 4963 | ||
| 4080 | static unsigned long task_h_load(struct task_struct *p); | ||
| 4081 | |||
| 4082 | static const unsigned int sched_nr_migrate_break = 32; | 4964 | static const unsigned int sched_nr_migrate_break = 32; |
| 4083 | 4965 | ||
| 4084 | /* | 4966 | /* |
| @@ -4291,6 +5173,10 @@ struct sg_lb_stats { | |||
| 4291 | unsigned int group_weight; | 5173 | unsigned int group_weight; |
| 4292 | int group_imb; /* Is there an imbalance in the group ? */ | 5174 | int group_imb; /* Is there an imbalance in the group ? */ |
| 4293 | int group_has_capacity; /* Is there extra capacity in the group? */ | 5175 | int group_has_capacity; /* Is there extra capacity in the group? */ |
| 5176 | #ifdef CONFIG_NUMA_BALANCING | ||
| 5177 | unsigned int nr_numa_running; | ||
| 5178 | unsigned int nr_preferred_running; | ||
| 5179 | #endif | ||
| 4294 | }; | 5180 | }; |
| 4295 | 5181 | ||
| 4296 | /* | 5182 | /* |
| @@ -4330,7 +5216,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
| 4330 | /** | 5216 | /** |
| 4331 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 5217 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
| 4332 | * @sd: The sched_domain whose load_idx is to be obtained. | 5218 | * @sd: The sched_domain whose load_idx is to be obtained. |
| 4333 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. | 5219 | * @idle: The idle status of the CPU for whose sd load_idx is obtained. |
| 4334 | * | 5220 | * |
| 4335 | * Return: The load index. | 5221 | * Return: The load index. |
| 4336 | */ | 5222 | */ |
| @@ -4447,7 +5333,7 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4447 | { | 5333 | { |
| 4448 | struct sched_domain *child = sd->child; | 5334 | struct sched_domain *child = sd->child; |
| 4449 | struct sched_group *group, *sdg = sd->groups; | 5335 | struct sched_group *group, *sdg = sd->groups; |
| 4450 | unsigned long power; | 5336 | unsigned long power, power_orig; |
| 4451 | unsigned long interval; | 5337 | unsigned long interval; |
| 4452 | 5338 | ||
| 4453 | interval = msecs_to_jiffies(sd->balance_interval); | 5339 | interval = msecs_to_jiffies(sd->balance_interval); |
| @@ -4459,7 +5345,7 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4459 | return; | 5345 | return; |
| 4460 | } | 5346 | } |
| 4461 | 5347 | ||
| 4462 | power = 0; | 5348 | power_orig = power = 0; |
| 4463 | 5349 | ||
| 4464 | if (child->flags & SD_OVERLAP) { | 5350 | if (child->flags & SD_OVERLAP) { |
| 4465 | /* | 5351 | /* |
| @@ -4467,8 +5353,12 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4467 | * span the current group. | 5353 | * span the current group. |
| 4468 | */ | 5354 | */ |
| 4469 | 5355 | ||
| 4470 | for_each_cpu(cpu, sched_group_cpus(sdg)) | 5356 | for_each_cpu(cpu, sched_group_cpus(sdg)) { |
| 4471 | power += power_of(cpu); | 5357 | struct sched_group *sg = cpu_rq(cpu)->sd->groups; |
| 5358 | |||
| 5359 | power_orig += sg->sgp->power_orig; | ||
| 5360 | power += sg->sgp->power; | ||
| 5361 | } | ||
| 4472 | } else { | 5362 | } else { |
| 4473 | /* | 5363 | /* |
| 4474 | * !SD_OVERLAP domains can assume that child groups | 5364 | * !SD_OVERLAP domains can assume that child groups |
| @@ -4477,12 +5367,14 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4477 | 5367 | ||
| 4478 | group = child->groups; | 5368 | group = child->groups; |
| 4479 | do { | 5369 | do { |
| 5370 | power_orig += group->sgp->power_orig; | ||
| 4480 | power += group->sgp->power; | 5371 | power += group->sgp->power; |
| 4481 | group = group->next; | 5372 | group = group->next; |
| 4482 | } while (group != child->groups); | 5373 | } while (group != child->groups); |
| 4483 | } | 5374 | } |
| 4484 | 5375 | ||
| 4485 | sdg->sgp->power_orig = sdg->sgp->power = power; | 5376 | sdg->sgp->power_orig = power_orig; |
| 5377 | sdg->sgp->power = power; | ||
| 4486 | } | 5378 | } |
| 4487 | 5379 | ||
| 4488 | /* | 5380 | /* |
| @@ -4526,13 +5418,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 4526 | * cpu 3 and leave one of the cpus in the second group unused. | 5418 | * cpu 3 and leave one of the cpus in the second group unused. |
| 4527 | * | 5419 | * |
| 4528 | * The current solution to this issue is detecting the skew in the first group | 5420 | * The current solution to this issue is detecting the skew in the first group |
| 4529 | * by noticing it has a cpu that is overloaded while the remaining cpus are | 5421 | * by noticing the lower domain failed to reach balance and had difficulty |
| 4530 | * idle -- or rather, there's a distinct imbalance in the cpus; see | 5422 | * moving tasks due to affinity constraints. |
| 4531 | * sg_imbalanced(). | ||
| 4532 | * | 5423 | * |
| 4533 | * When this is so detected; this group becomes a candidate for busiest; see | 5424 | * When this is so detected; this group becomes a candidate for busiest; see |
| 4534 | * update_sd_pick_busiest(). And calculcate_imbalance() and | 5425 | * update_sd_pick_busiest(). And calculate_imbalance() and |
| 4535 | * find_busiest_group() avoid some of the usual balance conditional to allow it | 5426 | * find_busiest_group() avoid some of the usual balance conditions to allow it |
| 4536 | * to create an effective group imbalance. | 5427 | * to create an effective group imbalance. |
| 4537 | * | 5428 | * |
| 4538 | * This is a somewhat tricky proposition since the next run might not find the | 5429 | * This is a somewhat tricky proposition since the next run might not find the |
| @@ -4540,49 +5431,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 4540 | * subtle and fragile situation. | 5431 | * subtle and fragile situation. |
| 4541 | */ | 5432 | */ |
| 4542 | 5433 | ||
| 4543 | struct sg_imb_stats { | 5434 | static inline int sg_imbalanced(struct sched_group *group) |
| 4544 | unsigned long max_nr_running, min_nr_running; | ||
| 4545 | unsigned long max_cpu_load, min_cpu_load; | ||
| 4546 | }; | ||
| 4547 | |||
| 4548 | static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) | ||
| 4549 | { | 5435 | { |
| 4550 | sgi->max_cpu_load = sgi->max_nr_running = 0UL; | 5436 | return group->sgp->imbalance; |
| 4551 | sgi->min_cpu_load = sgi->min_nr_running = ~0UL; | ||
| 4552 | } | 5437 | } |
| 4553 | 5438 | ||
| 4554 | static inline void | 5439 | /* |
| 4555 | update_sg_imb_stats(struct sg_imb_stats *sgi, | 5440 | * Compute the group capacity. |
| 4556 | unsigned long load, unsigned long nr_running) | 5441 | * |
| 5442 | * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by | ||
| 5443 | * first dividing out the smt factor and computing the actual number of cores | ||
| 5444 | * and limit power unit capacity with that. | ||
| 5445 | */ | ||
| 5446 | static inline int sg_capacity(struct lb_env *env, struct sched_group *group) | ||
| 4557 | { | 5447 | { |
| 4558 | if (load > sgi->max_cpu_load) | 5448 | unsigned int capacity, smt, cpus; |
| 4559 | sgi->max_cpu_load = load; | 5449 | unsigned int power, power_orig; |
| 4560 | if (sgi->min_cpu_load > load) | ||
| 4561 | sgi->min_cpu_load = load; | ||
| 4562 | 5450 | ||
| 4563 | if (nr_running > sgi->max_nr_running) | 5451 | power = group->sgp->power; |
| 4564 | sgi->max_nr_running = nr_running; | 5452 | power_orig = group->sgp->power_orig; |
| 4565 | if (sgi->min_nr_running > nr_running) | 5453 | cpus = group->group_weight; |
| 4566 | sgi->min_nr_running = nr_running; | ||
| 4567 | } | ||
| 4568 | 5454 | ||
| 4569 | static inline int | 5455 | /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ |
| 4570 | sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) | 5456 | smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); |
| 4571 | { | 5457 | capacity = cpus / smt; /* cores */ |
| 4572 | /* | ||
| 4573 | * Consider the group unbalanced when the imbalance is larger | ||
| 4574 | * than the average weight of a task. | ||
| 4575 | * | ||
| 4576 | * APZ: with cgroup the avg task weight can vary wildly and | ||
| 4577 | * might not be a suitable number - should we keep a | ||
| 4578 | * normalized nr_running number somewhere that negates | ||
| 4579 | * the hierarchy? | ||
| 4580 | */ | ||
| 4581 | if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && | ||
| 4582 | (sgi->max_nr_running - sgi->min_nr_running) > 1) | ||
| 4583 | return 1; | ||
| 4584 | 5458 | ||
| 4585 | return 0; | 5459 | capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); |
| 5460 | if (!capacity) | ||
| 5461 | capacity = fix_small_capacity(env->sd, group); | ||
| 5462 | |||
| 5463 | return capacity; | ||
| 4586 | } | 5464 | } |
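sg_capacity() above counts cores rather than hardware threads by first deriving the SMT factor from the unscaled group power. A worked example with plausible but made-up numbers, assuming the usual SCHED_POWER_SCALE of 1024 and roughly 589 units of power_orig per SMT sibling (two siblings sharing a ~1178-unit core):

/*
 * Worked example of the sg_capacity() arithmetic: a group of 4 SMT
 * siblings on 2 physical cores. The power values are illustrative only.
 */
#include <stdio.h>

#define SCHED_POWER_SCALE	1024
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + (d) / 2) / (d))

int main(void)
{
	unsigned int cpus = 4;			/* group_weight: 4 hardware threads */
	unsigned int power_orig = 4 * 589;	/* 2356: unscaled sum over the group */
	unsigned int power = 2200;		/* current power after rt/irq pressure */

	unsigned int smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
	unsigned int capacity = cpus / smt;	/* number of cores, not threads */

	if (capacity > DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE))
		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

	printf("smt=%u capacity=%u\n", smt, capacity);	/* prints smt=2 capacity=2 */
	return 0;
}

Without the smt step, summing fractional per-thread powers across a large group could round up to more "cores" than physically exist, which is the phantom-core problem the comment refers to.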
| 4587 | 5465 | ||
| 4588 | /** | 5466 | /** |
| @@ -4597,12 +5475,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4597 | struct sched_group *group, int load_idx, | 5475 | struct sched_group *group, int load_idx, |
| 4598 | int local_group, struct sg_lb_stats *sgs) | 5476 | int local_group, struct sg_lb_stats *sgs) |
| 4599 | { | 5477 | { |
| 4600 | struct sg_imb_stats sgi; | ||
| 4601 | unsigned long nr_running; | 5478 | unsigned long nr_running; |
| 4602 | unsigned long load; | 5479 | unsigned long load; |
| 4603 | int i; | 5480 | int i; |
| 4604 | 5481 | ||
| 4605 | init_sg_imb_stats(&sgi); | 5482 | memset(sgs, 0, sizeof(*sgs)); |
| 4606 | 5483 | ||
| 4607 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5484 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 4608 | struct rq *rq = cpu_rq(i); | 5485 | struct rq *rq = cpu_rq(i); |
| @@ -4610,24 +5487,22 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4610 | nr_running = rq->nr_running; | 5487 | nr_running = rq->nr_running; |
| 4611 | 5488 | ||
| 4612 | /* Bias balancing toward cpus of our domain */ | 5489 | /* Bias balancing toward cpus of our domain */ |
| 4613 | if (local_group) { | 5490 | if (local_group) |
| 4614 | load = target_load(i, load_idx); | 5491 | load = target_load(i, load_idx); |
| 4615 | } else { | 5492 | else |
| 4616 | load = source_load(i, load_idx); | 5493 | load = source_load(i, load_idx); |
| 4617 | update_sg_imb_stats(&sgi, load, nr_running); | ||
| 4618 | } | ||
| 4619 | 5494 | ||
| 4620 | sgs->group_load += load; | 5495 | sgs->group_load += load; |
| 4621 | sgs->sum_nr_running += nr_running; | 5496 | sgs->sum_nr_running += nr_running; |
| 5497 | #ifdef CONFIG_NUMA_BALANCING | ||
| 5498 | sgs->nr_numa_running += rq->nr_numa_running; | ||
| 5499 | sgs->nr_preferred_running += rq->nr_preferred_running; | ||
| 5500 | #endif | ||
| 4622 | sgs->sum_weighted_load += weighted_cpuload(i); | 5501 | sgs->sum_weighted_load += weighted_cpuload(i); |
| 4623 | if (idle_cpu(i)) | 5502 | if (idle_cpu(i)) |
| 4624 | sgs->idle_cpus++; | 5503 | sgs->idle_cpus++; |
| 4625 | } | 5504 | } |
| 4626 | 5505 | ||
| 4627 | if (local_group && (env->idle != CPU_NEWLY_IDLE || | ||
| 4628 | time_after_eq(jiffies, group->sgp->next_update))) | ||
| 4629 | update_group_power(env->sd, env->dst_cpu); | ||
| 4630 | |||
| 4631 | /* Adjust by relative CPU power of the group */ | 5506 | /* Adjust by relative CPU power of the group */ |
| 4632 | sgs->group_power = group->sgp->power; | 5507 | sgs->group_power = group->sgp->power; |
| 4633 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | 5508 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; |
| @@ -4635,16 +5510,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4635 | if (sgs->sum_nr_running) | 5510 | if (sgs->sum_nr_running) |
| 4636 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 5511 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 4637 | 5512 | ||
| 4638 | sgs->group_imb = sg_imbalanced(sgs, &sgi); | ||
| 4639 | |||
| 4640 | sgs->group_capacity = | ||
| 4641 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); | ||
| 4642 | |||
| 4643 | if (!sgs->group_capacity) | ||
| 4644 | sgs->group_capacity = fix_small_capacity(env->sd, group); | ||
| 4645 | |||
| 4646 | sgs->group_weight = group->group_weight; | 5513 | sgs->group_weight = group->group_weight; |
| 4647 | 5514 | ||
| 5515 | sgs->group_imb = sg_imbalanced(group); | ||
| 5516 | sgs->group_capacity = sg_capacity(env, group); | ||
| 5517 | |||
| 4648 | if (sgs->group_capacity > sgs->sum_nr_running) | 5518 | if (sgs->group_capacity > sgs->sum_nr_running) |
| 4649 | sgs->group_has_capacity = 1; | 5519 | sgs->group_has_capacity = 1; |
| 4650 | } | 5520 | } |
| @@ -4693,14 +5563,42 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 4693 | return false; | 5563 | return false; |
| 4694 | } | 5564 | } |
| 4695 | 5565 | ||
| 5566 | #ifdef CONFIG_NUMA_BALANCING | ||
| 5567 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
| 5568 | { | ||
| 5569 | if (sgs->sum_nr_running > sgs->nr_numa_running) | ||
| 5570 | return regular; | ||
| 5571 | if (sgs->sum_nr_running > sgs->nr_preferred_running) | ||
| 5572 | return remote; | ||
| 5573 | return all; | ||
| 5574 | } | ||
| 5575 | |||
| 5576 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
| 5577 | { | ||
| 5578 | if (rq->nr_running > rq->nr_numa_running) | ||
| 5579 | return regular; | ||
| 5580 | if (rq->nr_running > rq->nr_preferred_running) | ||
| 5581 | return remote; | ||
| 5582 | return all; | ||
| 5583 | } | ||
| 5584 | #else | ||
| 5585 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
| 5586 | { | ||
| 5587 | return all; | ||
| 5588 | } | ||
| 5589 | |||
| 5590 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
| 5591 | { | ||
| 5592 | return regular; | ||
| 5593 | } | ||
| 5594 | #endif /* CONFIG_NUMA_BALANCING */ | ||
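The fbq_type classification added here is what later lets find_busiest_queue() prefer pulling from runqueues that still hold misplaced or non-NUMA tasks. A small model of the three-way split, with invented task counts:

/*
 * Model of fbq_classify_group()/fbq_classify_rq(): the counts passed in
 * are invented and stand in for the sg_lb_stats / rq fields.
 */
#include <stdio.h>

enum fbq_type { regular, remote, all };

static enum fbq_type classify(unsigned int nr_running,
			      unsigned int nr_numa_running,
			      unsigned int nr_preferred_running)
{
	if (nr_running > nr_numa_running)
		return regular;		/* some !numa tasks present */
	if (nr_running > nr_preferred_running)
		return remote;		/* numa tasks on the wrong node */
	return all;			/* everything ideally placed */
}

int main(void)
{
	static const char *name[] = { "regular", "remote", "all" };

	printf("%s\n", name[classify(4, 2, 2)]);
	printf("%s\n", name[classify(4, 4, 1)]);
	printf("%s\n", name[classify(4, 4, 4)]);
	return 0;
}

A group only classifies as "all" once every task is a NUMA task sitting on its preferred node, which is exactly the case the balancer wants to leave alone when better options exist.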
| 5595 | |||
| 4696 | /** | 5596 | /** |
| 4697 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 5597 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
| 4698 | * @env: The load balancing environment. | 5598 | * @env: The load balancing environment. |
| 4699 | * @balance: Should we balance. | ||
| 4700 | * @sds: variable to hold the statistics for this sched_domain. | 5599 | * @sds: variable to hold the statistics for this sched_domain. |
| 4701 | */ | 5600 | */ |
| 4702 | static inline void update_sd_lb_stats(struct lb_env *env, | 5601 | static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) |
| 4703 | struct sd_lb_stats *sds) | ||
| 4704 | { | 5602 | { |
| 4705 | struct sched_domain *child = env->sd->child; | 5603 | struct sched_domain *child = env->sd->child; |
| 4706 | struct sched_group *sg = env->sd->groups; | 5604 | struct sched_group *sg = env->sd->groups; |
| @@ -4720,11 +5618,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 4720 | if (local_group) { | 5618 | if (local_group) { |
| 4721 | sds->local = sg; | 5619 | sds->local = sg; |
| 4722 | sgs = &sds->local_stat; | 5620 | sgs = &sds->local_stat; |
| 5621 | |||
| 5622 | if (env->idle != CPU_NEWLY_IDLE || | ||
| 5623 | time_after_eq(jiffies, sg->sgp->next_update)) | ||
| 5624 | update_group_power(env->sd, env->dst_cpu); | ||
| 4723 | } | 5625 | } |
| 4724 | 5626 | ||
| 4725 | memset(sgs, 0, sizeof(*sgs)); | ||
| 4726 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); | 5627 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
| 4727 | 5628 | ||
| 5629 | if (local_group) | ||
| 5630 | goto next_group; | ||
| 5631 | |||
| 4728 | /* | 5632 | /* |
| 4729 | * In case the child domain prefers tasks go to siblings | 5633 | * In case the child domain prefers tasks go to siblings |
| 4730 | * first, lower the sg capacity to one so that we'll try | 5634 | * first, lower the sg capacity to one so that we'll try |
| @@ -4735,21 +5639,25 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 4735 | * heaviest group when it is already under-utilized (possible | 5639 | * heaviest group when it is already under-utilized (possible |
| 4736 | * with a large weight task outweighs the tasks on the system). | 5640 | * with a large weight task outweighs the tasks on the system). |
| 4737 | */ | 5641 | */ |
| 4738 | if (prefer_sibling && !local_group && | 5642 | if (prefer_sibling && sds->local && |
| 4739 | sds->local && sds->local_stat.group_has_capacity) | 5643 | sds->local_stat.group_has_capacity) |
| 4740 | sgs->group_capacity = min(sgs->group_capacity, 1U); | 5644 | sgs->group_capacity = min(sgs->group_capacity, 1U); |
| 4741 | 5645 | ||
| 4742 | /* Now, start updating sd_lb_stats */ | 5646 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
| 4743 | sds->total_load += sgs->group_load; | ||
| 4744 | sds->total_pwr += sgs->group_power; | ||
| 4745 | |||
| 4746 | if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { | ||
| 4747 | sds->busiest = sg; | 5647 | sds->busiest = sg; |
| 4748 | sds->busiest_stat = *sgs; | 5648 | sds->busiest_stat = *sgs; |
| 4749 | } | 5649 | } |
| 4750 | 5650 | ||
| 5651 | next_group: | ||
| 5652 | /* Now, start updating sd_lb_stats */ | ||
| 5653 | sds->total_load += sgs->group_load; | ||
| 5654 | sds->total_pwr += sgs->group_power; | ||
| 5655 | |||
| 4751 | sg = sg->next; | 5656 | sg = sg->next; |
| 4752 | } while (sg != env->sd->groups); | 5657 | } while (sg != env->sd->groups); |
| 5658 | |||
| 5659 | if (env->sd->flags & SD_NUMA) | ||
| 5660 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | ||
| 4753 | } | 5661 | } |
| 4754 | 5662 | ||
| 4755 | /** | 5663 | /** |
| @@ -5053,15 +5961,39 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 5053 | int i; | 5961 | int i; |
| 5054 | 5962 | ||
| 5055 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5963 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 5056 | unsigned long power = power_of(i); | 5964 | unsigned long power, capacity, wl; |
| 5057 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 5965 | enum fbq_type rt; |
| 5058 | SCHED_POWER_SCALE); | 5966 | |
| 5059 | unsigned long wl; | 5967 | rq = cpu_rq(i); |
| 5968 | rt = fbq_classify_rq(rq); | ||
| 5969 | |||
| 5970 | /* | ||
| 5971 | * We classify groups/runqueues into three groups: | ||
| 5972 | * - regular: there are !numa tasks | ||
| 5973 | * - remote: there are numa tasks that run on the 'wrong' node | ||
| 5974 | * - all: there is no distinction | ||
| 5975 | * | ||
| 5976 | * In order to avoid migrating ideally placed numa tasks, | ||
| 5977 | * ignore those when there's better options. | ||
| 5978 | * | ||
| 5979 | * If we ignore the actual busiest queue to migrate another | ||
| 5980 | * task, the next balance pass can still reduce the busiest | ||
| 5981 | * queue by moving tasks around inside the node. | ||
| 5982 | * | ||
| 5983 | * If we cannot move enough load due to this classification | ||
| 5984 | * the next pass will adjust the group classification and | ||
| 5985 | * allow migration of more tasks. | ||
| 5986 | * | ||
| 5987 | * Both cases only affect the total convergence complexity. | ||
| 5988 | */ | ||
| 5989 | if (rt > env->fbq_type) | ||
| 5990 | continue; | ||
| 5060 | 5991 | ||
| 5992 | power = power_of(i); | ||
| 5993 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | ||
| 5061 | if (!capacity) | 5994 | if (!capacity) |
| 5062 | capacity = fix_small_capacity(env->sd, group); | 5995 | capacity = fix_small_capacity(env->sd, group); |
| 5063 | 5996 | ||
| 5064 | rq = cpu_rq(i); | ||
| 5065 | wl = weighted_cpuload(i); | 5997 | wl = weighted_cpuload(i); |
| 5066 | 5998 | ||
| 5067 | /* | 5999 | /* |
| @@ -5164,6 +6096,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 5164 | int *continue_balancing) | 6096 | int *continue_balancing) |
| 5165 | { | 6097 | { |
| 5166 | int ld_moved, cur_ld_moved, active_balance = 0; | 6098 | int ld_moved, cur_ld_moved, active_balance = 0; |
| 6099 | struct sched_domain *sd_parent = sd->parent; | ||
| 5167 | struct sched_group *group; | 6100 | struct sched_group *group; |
| 5168 | struct rq *busiest; | 6101 | struct rq *busiest; |
| 5169 | unsigned long flags; | 6102 | unsigned long flags; |
| @@ -5177,6 +6110,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 5177 | .idle = idle, | 6110 | .idle = idle, |
| 5178 | .loop_break = sched_nr_migrate_break, | 6111 | .loop_break = sched_nr_migrate_break, |
| 5179 | .cpus = cpus, | 6112 | .cpus = cpus, |
| 6113 | .fbq_type = all, | ||
| 5180 | }; | 6114 | }; |
| 5181 | 6115 | ||
| 5182 | /* | 6116 | /* |
| @@ -5268,17 +6202,17 @@ more_balance: | |||
| 5268 | * moreover subsequent load balance cycles should correct the | 6202 | * moreover subsequent load balance cycles should correct the |
| 5269 | * excess load moved. | 6203 | * excess load moved. |
| 5270 | */ | 6204 | */ |
| 5271 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6205 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { |
| 6206 | |||
| 6207 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
| 6208 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
| 5272 | 6209 | ||
| 5273 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 6210 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
| 5274 | env.dst_cpu = env.new_dst_cpu; | 6211 | env.dst_cpu = env.new_dst_cpu; |
| 5275 | env.flags &= ~LBF_SOME_PINNED; | 6212 | env.flags &= ~LBF_DST_PINNED; |
| 5276 | env.loop = 0; | 6213 | env.loop = 0; |
| 5277 | env.loop_break = sched_nr_migrate_break; | 6214 | env.loop_break = sched_nr_migrate_break; |
| 5278 | 6215 | ||
| 5279 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
| 5280 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
| 5281 | |||
| 5282 | /* | 6216 | /* |
| 5283 | * Go back to "more_balance" rather than "redo" since we | 6217 | * Go back to "more_balance" rather than "redo" since we |
| 5284 | * need to continue with same src_cpu. | 6218 | * need to continue with same src_cpu. |
| @@ -5286,6 +6220,18 @@ more_balance: | |||
| 5286 | goto more_balance; | 6220 | goto more_balance; |
| 5287 | } | 6221 | } |
| 5288 | 6222 | ||
| 6223 | /* | ||
| 6224 | * We failed to reach balance because of affinity. | ||
| 6225 | */ | ||
| 6226 | if (sd_parent) { | ||
| 6227 | int *group_imbalance = &sd_parent->groups->sgp->imbalance; | ||
| 6228 | |||
| 6229 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | ||
| 6230 | *group_imbalance = 1; | ||
| 6231 | } else if (*group_imbalance) | ||
| 6232 | *group_imbalance = 0; | ||
| 6233 | } | ||
| 6234 | |||
| 5289 | /* All tasks on this runqueue were pinned by CPU affinity */ | 6235 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 5290 | if (unlikely(env.flags & LBF_ALL_PINNED)) { | 6236 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
| 5291 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 6237 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
| @@ -5393,6 +6339,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5393 | struct sched_domain *sd; | 6339 | struct sched_domain *sd; |
| 5394 | int pulled_task = 0; | 6340 | int pulled_task = 0; |
| 5395 | unsigned long next_balance = jiffies + HZ; | 6341 | unsigned long next_balance = jiffies + HZ; |
| 6342 | u64 curr_cost = 0; | ||
| 5396 | 6343 | ||
| 5397 | this_rq->idle_stamp = rq_clock(this_rq); | 6344 | this_rq->idle_stamp = rq_clock(this_rq); |
| 5398 | 6345 | ||
| @@ -5409,15 +6356,27 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5409 | for_each_domain(this_cpu, sd) { | 6356 | for_each_domain(this_cpu, sd) { |
| 5410 | unsigned long interval; | 6357 | unsigned long interval; |
| 5411 | int continue_balancing = 1; | 6358 | int continue_balancing = 1; |
| 6359 | u64 t0, domain_cost; | ||
| 5412 | 6360 | ||
| 5413 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6361 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 5414 | continue; | 6362 | continue; |
| 5415 | 6363 | ||
| 6364 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) | ||
| 6365 | break; | ||
| 6366 | |||
| 5416 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 6367 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
| 6368 | t0 = sched_clock_cpu(this_cpu); | ||
| 6369 | |||
| 5417 | /* If we've pulled tasks over stop searching: */ | 6370 | /* If we've pulled tasks over stop searching: */ |
| 5418 | pulled_task = load_balance(this_cpu, this_rq, | 6371 | pulled_task = load_balance(this_cpu, this_rq, |
| 5419 | sd, CPU_NEWLY_IDLE, | 6372 | sd, CPU_NEWLY_IDLE, |
| 5420 | &continue_balancing); | 6373 | &continue_balancing); |
| 6374 | |||
| 6375 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
| 6376 | if (domain_cost > sd->max_newidle_lb_cost) | ||
| 6377 | sd->max_newidle_lb_cost = domain_cost; | ||
| 6378 | |||
| 6379 | curr_cost += domain_cost; | ||
| 5421 | } | 6380 | } |
| 5422 | 6381 | ||
| 5423 | interval = msecs_to_jiffies(sd->balance_interval); | 6382 | interval = msecs_to_jiffies(sd->balance_interval); |
| @@ -5439,6 +6398,9 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5439 | */ | 6398 | */ |
| 5440 | this_rq->next_balance = next_balance; | 6399 | this_rq->next_balance = next_balance; |
| 5441 | } | 6400 | } |
| 6401 | |||
| 6402 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
| 6403 | this_rq->max_idle_balance_cost = curr_cost; | ||
| 5442 | } | 6404 | } |
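The avg_idle check added to idle_balance() skips any domain whose worst observed newidle balance cost, plus what has already been spent this pass, exceeds the expected idle time. A sketch of that cutoff with arbitrary nanosecond values:

/*
 * Sketch of the newly-idle cost cutoff; the idle estimate and per-domain
 * costs are arbitrary illustrative numbers.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long avg_idle = 500000;		/* rq->avg_idle, ~0.5ms */
	unsigned long long max_cost[] = { 60000, 180000, 400000 }; /* per-domain max_newidle_lb_cost */
	unsigned long long curr_cost = 0;

	for (int d = 0; d < 3; d++) {
		if (avg_idle < curr_cost + max_cost[d]) {
			printf("domain %d: skip, expected idle time too short\n", d);
			break;
		}
		curr_cost += max_cost[d];	/* assume the balance cost hit its max */
		printf("domain %d: balanced, cumulative cost %llu\n", d, curr_cost);
	}
	return 0;
}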
| 5443 | 6405 | ||
| 5444 | /* | 6406 | /* |
| @@ -5572,16 +6534,16 @@ static inline void nohz_balance_exit_idle(int cpu) | |||
| 5572 | static inline void set_cpu_sd_state_busy(void) | 6534 | static inline void set_cpu_sd_state_busy(void) |
| 5573 | { | 6535 | { |
| 5574 | struct sched_domain *sd; | 6536 | struct sched_domain *sd; |
| 6537 | int cpu = smp_processor_id(); | ||
| 5575 | 6538 | ||
| 5576 | rcu_read_lock(); | 6539 | rcu_read_lock(); |
| 5577 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); | 6540 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 5578 | 6541 | ||
| 5579 | if (!sd || !sd->nohz_idle) | 6542 | if (!sd || !sd->nohz_idle) |
| 5580 | goto unlock; | 6543 | goto unlock; |
| 5581 | sd->nohz_idle = 0; | 6544 | sd->nohz_idle = 0; |
| 5582 | 6545 | ||
| 5583 | for (; sd; sd = sd->parent) | 6546 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); |
| 5584 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | ||
| 5585 | unlock: | 6547 | unlock: |
| 5586 | rcu_read_unlock(); | 6548 | rcu_read_unlock(); |
| 5587 | } | 6549 | } |
| @@ -5589,16 +6551,16 @@ unlock: | |||
| 5589 | void set_cpu_sd_state_idle(void) | 6551 | void set_cpu_sd_state_idle(void) |
| 5590 | { | 6552 | { |
| 5591 | struct sched_domain *sd; | 6553 | struct sched_domain *sd; |
| 6554 | int cpu = smp_processor_id(); | ||
| 5592 | 6555 | ||
| 5593 | rcu_read_lock(); | 6556 | rcu_read_lock(); |
| 5594 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); | 6557 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 5595 | 6558 | ||
| 5596 | if (!sd || sd->nohz_idle) | 6559 | if (!sd || sd->nohz_idle) |
| 5597 | goto unlock; | 6560 | goto unlock; |
| 5598 | sd->nohz_idle = 1; | 6561 | sd->nohz_idle = 1; |
| 5599 | 6562 | ||
| 5600 | for (; sd; sd = sd->parent) | 6563 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); |
| 5601 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | ||
| 5602 | unlock: | 6564 | unlock: |
| 5603 | rcu_read_unlock(); | 6565 | rcu_read_unlock(); |
| 5604 | } | 6566 | } |
| @@ -5662,15 +6624,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 5662 | /* Earliest time when we have to do rebalance again */ | 6624 | /* Earliest time when we have to do rebalance again */ |
| 5663 | unsigned long next_balance = jiffies + 60*HZ; | 6625 | unsigned long next_balance = jiffies + 60*HZ; |
| 5664 | int update_next_balance = 0; | 6626 | int update_next_balance = 0; |
| 5665 | int need_serialize; | 6627 | int need_serialize, need_decay = 0; |
| 6628 | u64 max_cost = 0; | ||
| 5666 | 6629 | ||
| 5667 | update_blocked_averages(cpu); | 6630 | update_blocked_averages(cpu); |
| 5668 | 6631 | ||
| 5669 | rcu_read_lock(); | 6632 | rcu_read_lock(); |
| 5670 | for_each_domain(cpu, sd) { | 6633 | for_each_domain(cpu, sd) { |
| 6634 | /* | ||
| 6635 | * Decay the newidle max times here because this is a regular | ||
| 6636 | * visit to all the domains. Decay ~1% per second. | ||
| 6637 | */ | ||
| 6638 | if (time_after(jiffies, sd->next_decay_max_lb_cost)) { | ||
| 6639 | sd->max_newidle_lb_cost = | ||
| 6640 | (sd->max_newidle_lb_cost * 253) / 256; | ||
| 6641 | sd->next_decay_max_lb_cost = jiffies + HZ; | ||
| 6642 | need_decay = 1; | ||
| 6643 | } | ||
| 6644 | max_cost += sd->max_newidle_lb_cost; | ||
| 6645 | |||
| 5671 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6646 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 5672 | continue; | 6647 | continue; |
| 5673 | 6648 | ||
| 6649 | /* | ||
| 6650 | * Stop the load balance at this level. There is another | ||
| 6651 | * CPU in our sched group which is doing load balancing more | ||
| 6652 | * actively. | ||
| 6653 | */ | ||
| 6654 | if (!continue_balancing) { | ||
| 6655 | if (need_decay) | ||
| 6656 | continue; | ||
| 6657 | break; | ||
| 6658 | } | ||
| 6659 | |||
| 5674 | interval = sd->balance_interval; | 6660 | interval = sd->balance_interval; |
| 5675 | if (idle != CPU_IDLE) | 6661 | if (idle != CPU_IDLE) |
| 5676 | interval *= sd->busy_factor; | 6662 | interval *= sd->busy_factor; |
| @@ -5689,7 +6675,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 5689 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 6675 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
| 5690 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { | 6676 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { |
| 5691 | /* | 6677 | /* |
| 5692 | * The LBF_SOME_PINNED logic could have changed | 6678 | * The LBF_DST_PINNED logic could have changed |
| 5693 | * env->dst_cpu, so we can't know our idle | 6679 | * env->dst_cpu, so we can't know our idle |
| 5694 | * state even if we migrated tasks. Update it. | 6680 | * state even if we migrated tasks. Update it. |
| 5695 | */ | 6681 | */ |
| @@ -5704,14 +6690,14 @@ out: | |||
| 5704 | next_balance = sd->last_balance + interval; | 6690 | next_balance = sd->last_balance + interval; |
| 5705 | update_next_balance = 1; | 6691 | update_next_balance = 1; |
| 5706 | } | 6692 | } |
| 5707 | 6693 | } | |
| 6694 | if (need_decay) { | ||
| 5708 | /* | 6695 | /* |
| 5709 | * Stop the load balance at this level. There is another | 6696 | * Ensure the rq-wide value also decays but keep it at a |
| 5710 | * CPU in our sched group which is doing load balancing more | 6697 | * reasonable floor to avoid funnies with rq->avg_idle. |
| 5711 | * actively. | ||
| 5712 | */ | 6698 | */ |
| 5713 | if (!continue_balancing) | 6699 | rq->max_idle_balance_cost = |
| 5714 | break; | 6700 | max((u64)sysctl_sched_migration_cost, max_cost); |
| 5715 | } | 6701 | } |
| 5716 | rcu_read_unlock(); | 6702 | rcu_read_unlock(); |
| 5717 | 6703 | ||
| @@ -5781,6 +6767,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
| 5781 | { | 6767 | { |
| 5782 | unsigned long now = jiffies; | 6768 | unsigned long now = jiffies; |
| 5783 | struct sched_domain *sd; | 6769 | struct sched_domain *sd; |
| 6770 | struct sched_group_power *sgp; | ||
| 6771 | int nr_busy; | ||
| 5784 | 6772 | ||
| 5785 | if (unlikely(idle_cpu(cpu))) | 6773 | if (unlikely(idle_cpu(cpu))) |
| 5786 | return 0; | 6774 | return 0; |
| @@ -5806,22 +6794,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
| 5806 | goto need_kick; | 6794 | goto need_kick; |
| 5807 | 6795 | ||
| 5808 | rcu_read_lock(); | 6796 | rcu_read_lock(); |
| 5809 | for_each_domain(cpu, sd) { | 6797 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 5810 | struct sched_group *sg = sd->groups; | ||
| 5811 | struct sched_group_power *sgp = sg->sgp; | ||
| 5812 | int nr_busy = atomic_read(&sgp->nr_busy_cpus); | ||
| 5813 | 6798 | ||
| 5814 | if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) | 6799 | if (sd) { |
| 5815 | goto need_kick_unlock; | 6800 | sgp = sd->groups->sgp; |
| 6801 | nr_busy = atomic_read(&sgp->nr_busy_cpus); | ||
| 5816 | 6802 | ||
| 5817 | if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight | 6803 | if (nr_busy > 1) |
| 5818 | && (cpumask_first_and(nohz.idle_cpus_mask, | ||
| 5819 | sched_domain_span(sd)) < cpu)) | ||
| 5820 | goto need_kick_unlock; | 6804 | goto need_kick_unlock; |
| 5821 | |||
| 5822 | if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) | ||
| 5823 | break; | ||
| 5824 | } | 6805 | } |
| 6806 | |||
| 6807 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
| 6808 | |||
| 6809 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | ||
| 6810 | sched_domain_span(sd)) < cpu)) | ||
| 6811 | goto need_kick_unlock; | ||
| 6812 | |||
| 5825 | rcu_read_unlock(); | 6813 | rcu_read_unlock(); |
| 5826 | return 0; | 6814 | return 0; |
| 5827 | 6815 | ||
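The nohz_kick_needed() rewrite above replaces a full for_each_domain() walk on every check with two cached per-cpu pointers, sd_busy and sd_asym, resolved once when the domains are built. The sketch below models that caching pattern in plain userspace C; the structure, the flag values, and the point at which the cache is filled are simplified stand-ins, not the kernel's actual domain code.

#include <stdio.h>
#include <stddef.h>

#define SD_SHARE_PKG_RESOURCES  0x1
#define SD_ASYM_PACKING         0x2

struct sched_domain_model {
        int flags;
        struct sched_domain_model *parent;
};

/* what for_each_domain() used to do on every call */
static struct sched_domain_model *
find_flag_domain(struct sched_domain_model *sd, int flag)
{
        for (; sd; sd = sd->parent)
                if (sd->flags & flag)
                        return sd;
        return NULL;
}

int main(void)
{
        struct sched_domain_model numa = { .flags = 0, .parent = NULL };
        struct sched_domain_model core = { .flags = SD_SHARE_PKG_RESOURCES, .parent = &numa };

        /* done once at domain build time, analogous to filling sd_busy/sd_asym */
        struct sched_domain_model *sd_busy = find_flag_domain(&core, SD_SHARE_PKG_RESOURCES);
        struct sched_domain_model *sd_asym = find_flag_domain(&core, SD_ASYM_PACKING);

        /* the hot path is now a cached-pointer check instead of a walk */
        printf("sd_busy cached: %s\n", sd_busy ? "yes" : "no");
        printf("sd_asym cached: %s\n", sd_asym ? "yes" : "no");
        return 0;
}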
| @@ -6214,7 +7202,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
| 6214 | se->cfs_rq = parent->my_q; | 7202 | se->cfs_rq = parent->my_q; |
| 6215 | 7203 | ||
| 6216 | se->my_q = cfs_rq; | 7204 | se->my_q = cfs_rq; |
| 6217 | update_load_set(&se->load, 0); | 7205 | /* guarantee group entities always have weight */ |
| 7206 | update_load_set(&se->load, NICE_0_LOAD); | ||
| 6218 | se->parent = parent; | 7207 | se->parent = parent; |
| 6219 | } | 7208 | } |
| 6220 | 7209 | ||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 99399f8e4799..5716929a2e3a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false) | |||
| 63 | /* | 63 | /* |
| 64 | * Apply the automatic NUMA scheduling policy. Enabled automatically | 64 | * Apply the automatic NUMA scheduling policy. Enabled automatically |
| 65 | * at runtime if running on a NUMA machine. Can be controlled via | 65 | * at runtime if running on a NUMA machine. Can be controlled via |
| 66 | * numa_balancing=. Allow PTE scanning to be forced on UMA machines | 66 | * numa_balancing= |
| 67 | * for debugging the core machinery. | ||
| 68 | */ | 67 | */ |
| 69 | #ifdef CONFIG_NUMA_BALANCING | 68 | #ifdef CONFIG_NUMA_BALANCING |
| 70 | SCHED_FEAT(NUMA, false) | 69 | SCHED_FEAT(NUMA, false) |
| 71 | SCHED_FEAT(NUMA_FORCE, false) | 70 | |
| 71 | /* | ||
| 72 | * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a | ||
| 73 | * higher number of hinting faults are recorded during active load | ||
| 74 | * balancing. | ||
| 75 | */ | ||
| 76 | SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) | ||
| 77 | |||
| 78 | /* | ||
| 79 | * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a | ||
| 80 | * lower number of hinting faults have been recorded. As this has | ||
| 81 | * the potential to prevent a task ever migrating to a new node | ||
| 82 | * due to CPU overload it is disabled by default. | ||
| 83 | */ | ||
| 84 | SCHED_FEAT(NUMA_RESIST_LOWER, false) | ||
| 72 | #endif | 85 | #endif |
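The two new scheduler features above describe policy knobs; as a rough illustration of how such flags could gate a load-balance migration decision on per-node hinting-fault counts, here is a userspace sketch. The helper and its return convention are invented for the example and are not the kernel's task_numa code.

#include <stdbool.h>
#include <stdio.h>

static bool numa_favour_higher = true;  /* SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) */
static bool numa_resist_lower  = false; /* SCHED_FEAT(NUMA_RESIST_LOWER, false) */

/* Opinion on migrating a task from a node with src_faults to one with dst_faults */
static int numa_migrate_preference(unsigned long src_faults, unsigned long dst_faults)
{
        if (numa_favour_higher && dst_faults > src_faults)
                return 1;       /* moving towards more recorded faults: encourage */
        if (numa_resist_lower && dst_faults < src_faults)
                return -1;      /* moving towards fewer faults: resist */
        return 0;               /* no NUMA opinion; fall back to normal balancing */
}

int main(void)
{
        printf("%d\n", numa_migrate_preference(10, 100));      /* 1: favoured */
        printf("%d\n", numa_migrate_preference(100, 10));      /* 0: resist is off by default */
        return 0;
}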
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index d8da01008d39..516c3d9ceea1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
| @@ -9,7 +9,7 @@ | |||
| 9 | 9 | ||
| 10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
| 11 | static int | 11 | static int |
| 12 | select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) | 12 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) |
| 13 | { | 13 | { |
| 14 | return task_cpu(p); /* IDLE tasks as never migrated */ | 14 | return task_cpu(p); /* IDLE tasks as never migrated */ |
| 15 | } | 15 | } |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 01970c8e64df..7d57275fc396 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq) | |||
| 246 | * if we should look at the mask. It would be a shame | 246 | * if we should look at the mask. It would be a shame |
| 247 | * if we looked at the mask, but the mask was not | 247 | * if we looked at the mask, but the mask was not |
| 248 | * updated yet. | 248 | * updated yet. |
| 249 | * | ||
| 250 | * Matched by the barrier in pull_rt_task(). | ||
| 249 | */ | 251 | */ |
| 250 | wmb(); | 252 | smp_wmb(); |
| 251 | atomic_inc(&rq->rd->rto_count); | 253 | atomic_inc(&rq->rd->rto_count); |
| 252 | } | 254 | } |
| 253 | 255 | ||
| @@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq) | |||
| 1169 | static int find_lowest_rq(struct task_struct *task); | 1171 | static int find_lowest_rq(struct task_struct *task); |
| 1170 | 1172 | ||
| 1171 | static int | 1173 | static int |
| 1172 | select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | 1174 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) |
| 1173 | { | 1175 | { |
| 1174 | struct task_struct *curr; | 1176 | struct task_struct *curr; |
| 1175 | struct rq *rq; | 1177 | struct rq *rq; |
| 1176 | int cpu; | ||
| 1177 | |||
| 1178 | cpu = task_cpu(p); | ||
| 1179 | 1178 | ||
| 1180 | if (p->nr_cpus_allowed == 1) | 1179 | if (p->nr_cpus_allowed == 1) |
| 1181 | goto out; | 1180 | goto out; |
| @@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
| 1213 | */ | 1212 | */ |
| 1214 | if (curr && unlikely(rt_task(curr)) && | 1213 | if (curr && unlikely(rt_task(curr)) && |
| 1215 | (curr->nr_cpus_allowed < 2 || | 1214 | (curr->nr_cpus_allowed < 2 || |
| 1216 | curr->prio <= p->prio) && | 1215 | curr->prio <= p->prio)) { |
| 1217 | (p->nr_cpus_allowed > 1)) { | ||
| 1218 | int target = find_lowest_rq(p); | 1216 | int target = find_lowest_rq(p); |
| 1219 | 1217 | ||
| 1220 | if (target != -1) | 1218 | if (target != -1) |
| @@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1630 | if (likely(!rt_overloaded(this_rq))) | 1628 | if (likely(!rt_overloaded(this_rq))) |
| 1631 | return 0; | 1629 | return 0; |
| 1632 | 1630 | ||
| 1631 | /* | ||
| 1632 | * Match the barrier from rt_set_overload(); this guarantees that if we | ||
| 1633 | * see overloaded we must also see the rto_mask bit. | ||
| 1634 | */ | ||
| 1635 | smp_rmb(); | ||
| 1636 | |||
| 1633 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1637 | for_each_cpu(cpu, this_rq->rd->rto_mask) { |
| 1634 | if (this_cpu == cpu) | 1638 | if (this_cpu == cpu) |
| 1635 | continue; | 1639 | continue; |
| @@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
| 1931 | p->rt.time_slice = sched_rr_timeslice; | 1935 | p->rt.time_slice = sched_rr_timeslice; |
| 1932 | 1936 | ||
| 1933 | /* | 1937 | /* |
| 1934 | * Requeue to the end of queue if we (and all of our ancestors) are the | 1938 | * Requeue to the end of queue if we (and all of our ancestors) are not |
| 1935 | * only element on the queue | 1939 | * the only element on the queue |
| 1936 | */ | 1940 | */ |
| 1937 | for_each_sched_rt_entity(rt_se) { | 1941 | for_each_sched_rt_entity(rt_se) { |
| 1938 | if (rt_se->run_list.prev != rt_se->run_list.next) { | 1942 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
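The point of the paired barriers added above is ordering, not atomicity: the writer must publish the rto_mask bit before rto_count becomes non-zero, and the reader must separate its rto_count check from the rto_mask scan, or it can observe the count without the bit. Here is a userspace model using C11 fences as stand-ins for smp_wmb()/smp_rmb(); the data layout is simplified and the single-threaded main() only exercises the API shape.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int rto_count;
static atomic_int rto_mask;     /* one bit per CPU in the real code */

static void rt_set_overload_model(int cpu)
{
        atomic_fetch_or_explicit(&rto_mask, 1 << cpu, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
        atomic_fetch_add_explicit(&rto_count, 1, memory_order_relaxed);
}

static int pull_rt_task_model(void)
{
        if (!atomic_load_explicit(&rto_count, memory_order_relaxed))
                return 0;       /* not overloaded, nothing to pull */
        atomic_thread_fence(memory_order_acquire);      /* ~ smp_rmb() */
        return atomic_load_explicit(&rto_mask, memory_order_relaxed);
}

int main(void)
{
        rt_set_overload_model(2);
        printf("mask seen by puller: %#x\n", pull_rt_task_model());
        return 0;
}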
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3c5653e1dca..88c85b21d633 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/spinlock.h> | 6 | #include <linux/spinlock.h> |
| 7 | #include <linux/stop_machine.h> | 7 | #include <linux/stop_machine.h> |
| 8 | #include <linux/tick.h> | 8 | #include <linux/tick.h> |
| 9 | #include <linux/slab.h> | ||
| 9 | 10 | ||
| 10 | #include "cpupri.h" | 11 | #include "cpupri.h" |
| 11 | #include "cpuacct.h" | 12 | #include "cpuacct.h" |
| @@ -408,6 +409,10 @@ struct rq { | |||
| 408 | * remote CPUs use both these fields when doing load calculation. | 409 | * remote CPUs use both these fields when doing load calculation. |
| 409 | */ | 410 | */ |
| 410 | unsigned int nr_running; | 411 | unsigned int nr_running; |
| 412 | #ifdef CONFIG_NUMA_BALANCING | ||
| 413 | unsigned int nr_numa_running; | ||
| 414 | unsigned int nr_preferred_running; | ||
| 415 | #endif | ||
| 411 | #define CPU_LOAD_IDX_MAX 5 | 416 | #define CPU_LOAD_IDX_MAX 5 |
| 412 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 417 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
| 413 | unsigned long last_load_update_tick; | 418 | unsigned long last_load_update_tick; |
| @@ -476,6 +481,9 @@ struct rq { | |||
| 476 | u64 age_stamp; | 481 | u64 age_stamp; |
| 477 | u64 idle_stamp; | 482 | u64 idle_stamp; |
| 478 | u64 avg_idle; | 483 | u64 avg_idle; |
| 484 | |||
| 485 | /* This is used to determine avg_idle's max value */ | ||
| 486 | u64 max_idle_balance_cost; | ||
| 479 | #endif | 487 | #endif |
| 480 | 488 | ||
| 481 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 489 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| @@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq) | |||
| 552 | return rq->clock_task; | 560 | return rq->clock_task; |
| 553 | } | 561 | } |
| 554 | 562 | ||
| 563 | #ifdef CONFIG_NUMA_BALANCING | ||
| 564 | extern void sched_setnuma(struct task_struct *p, int node); | ||
| 565 | extern int migrate_task_to(struct task_struct *p, int cpu); | ||
| 566 | extern int migrate_swap(struct task_struct *, struct task_struct *); | ||
| 567 | #endif /* CONFIG_NUMA_BALANCING */ | ||
| 568 | |||
| 555 | #ifdef CONFIG_SMP | 569 | #ifdef CONFIG_SMP |
| 556 | 570 | ||
| 557 | #define rcu_dereference_check_sched_domain(p) \ | 571 | #define rcu_dereference_check_sched_domain(p) \ |
| @@ -593,9 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
| 593 | return hsd; | 607 | return hsd; |
| 594 | } | 608 | } |
| 595 | 609 | ||
| 610 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
| 611 | { | ||
| 612 | struct sched_domain *sd; | ||
| 613 | |||
| 614 | for_each_domain(cpu, sd) { | ||
| 615 | if (sd->flags & flag) | ||
| 616 | break; | ||
| 617 | } | ||
| 618 | |||
| 619 | return sd; | ||
| 620 | } | ||
| 621 | |||
| 596 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 622 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
| 597 | DECLARE_PER_CPU(int, sd_llc_size); | 623 | DECLARE_PER_CPU(int, sd_llc_size); |
| 598 | DECLARE_PER_CPU(int, sd_llc_id); | 624 | DECLARE_PER_CPU(int, sd_llc_id); |
| 625 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); | ||
| 626 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); | ||
| 627 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | ||
| 599 | 628 | ||
| 600 | struct sched_group_power { | 629 | struct sched_group_power { |
| 601 | atomic_t ref; | 630 | atomic_t ref; |
| @@ -605,6 +634,7 @@ struct sched_group_power { | |||
| 605 | */ | 634 | */ |
| 606 | unsigned int power, power_orig; | 635 | unsigned int power, power_orig; |
| 607 | unsigned long next_update; | 636 | unsigned long next_update; |
| 637 | int imbalance; /* XXX unrelated to power but shared group state */ | ||
| 608 | /* | 638 | /* |
| 609 | * Number of busy cpus in this group. | 639 | * Number of busy cpus in this group. |
| 610 | */ | 640 | */ |
| @@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
| 719 | */ | 749 | */ |
| 720 | smp_wmb(); | 750 | smp_wmb(); |
| 721 | task_thread_info(p)->cpu = cpu; | 751 | task_thread_info(p)->cpu = cpu; |
| 752 | p->wake_cpu = cpu; | ||
| 722 | #endif | 753 | #endif |
| 723 | } | 754 | } |
| 724 | 755 | ||
| @@ -974,7 +1005,7 @@ struct sched_class { | |||
| 974 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1005 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
| 975 | 1006 | ||
| 976 | #ifdef CONFIG_SMP | 1007 | #ifdef CONFIG_SMP |
| 977 | int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); | 1008 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
| 978 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | 1009 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); |
| 979 | 1010 | ||
| 980 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | 1011 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); |
| @@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | |||
| 1220 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | 1251 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); |
| 1221 | } | 1252 | } |
| 1222 | 1253 | ||
| 1254 | static inline void double_lock(spinlock_t *l1, spinlock_t *l2) | ||
| 1255 | { | ||
| 1256 | if (l1 > l2) | ||
| 1257 | swap(l1, l2); | ||
| 1258 | |||
| 1259 | spin_lock(l1); | ||
| 1260 | spin_lock_nested(l2, SINGLE_DEPTH_NESTING); | ||
| 1261 | } | ||
| 1262 | |||
| 1263 | static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) | ||
| 1264 | { | ||
| 1265 | if (l1 > l2) | ||
| 1266 | swap(l1, l2); | ||
| 1267 | |||
| 1268 | raw_spin_lock(l1); | ||
| 1269 | raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); | ||
| 1270 | } | ||
| 1271 | |||
| 1223 | /* | 1272 | /* |
| 1224 | * double_rq_lock - safely lock two runqueues | 1273 | * double_rq_lock - safely lock two runqueues |
| 1225 | * | 1274 | * |
| @@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
| 1305 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1354 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
| 1306 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1355 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
| 1307 | 1356 | ||
| 1308 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | 1357 | extern void cfs_bandwidth_usage_inc(void); |
| 1358 | extern void cfs_bandwidth_usage_dec(void); | ||
| 1309 | 1359 | ||
| 1310 | #ifdef CONFIG_NO_HZ_COMMON | 1360 | #ifdef CONFIG_NO_HZ_COMMON |
| 1311 | enum rq_nohz_flag_bits { | 1361 | enum rq_nohz_flag_bits { |
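The new double_lock() and double_raw_lock() helpers above avoid ABBA deadlock the same way double_rq_lock() does: by imposing one global order (the lock address) regardless of argument order. A minimal userspace illustration follows, with pthread mutexes standing in for spinlocks; the address cast is only there to keep the comparison well-defined in portable C.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static void double_lock_model(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
        if ((uintptr_t)l1 > (uintptr_t)l2) {    /* order by address, as in the hunk */
                pthread_mutex_t *tmp = l1;
                l1 = l2;
                l2 = tmp;
        }
        pthread_mutex_lock(l1);
        pthread_mutex_lock(l2);
}

static void double_unlock_model(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
        pthread_mutex_unlock(l1);
        pthread_mutex_unlock(l2);
}

int main(void)
{
        pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

        /* both call orders acquire the locks in the same (address) order */
        double_lock_model(&a, &b);
        double_unlock_model(&a, &b);
        double_lock_model(&b, &a);
        double_unlock_model(&b, &a);
        puts("no ABBA deadlock possible");
        return 0;
}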
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c7edee71bce8..4ab704339656 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
| @@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
| 59 | * from dequeue_task() to account for possible rq->clock skew across cpus. The | 59 | * from dequeue_task() to account for possible rq->clock skew across cpus. The |
| 60 | * delta taken on each cpu would annul the skew. | 60 | * delta taken on each cpu would annul the skew. |
| 61 | */ | 61 | */ |
| 62 | static inline void sched_info_dequeued(struct task_struct *t) | 62 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) |
| 63 | { | 63 | { |
| 64 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; | 64 | unsigned long long now = rq_clock(rq), delta = 0; |
| 65 | 65 | ||
| 66 | if (unlikely(sched_info_on())) | 66 | if (unlikely(sched_info_on())) |
| 67 | if (t->sched_info.last_queued) | 67 | if (t->sched_info.last_queued) |
| @@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
| 69 | sched_info_reset_dequeued(t); | 69 | sched_info_reset_dequeued(t); |
| 70 | t->sched_info.run_delay += delta; | 70 | t->sched_info.run_delay += delta; |
| 71 | 71 | ||
| 72 | rq_sched_info_dequeued(task_rq(t), delta); | 72 | rq_sched_info_dequeued(rq, delta); |
| 73 | } | 73 | } |
| 74 | 74 | ||
| 75 | /* | 75 | /* |
| @@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
| 77 | * long it was waiting to run. We also note when it began so that we | 77 | * long it was waiting to run. We also note when it began so that we |
| 78 | * can keep stats on how long its timeslice is. | 78 | * can keep stats on how long its timeslice is. |
| 79 | */ | 79 | */ |
| 80 | static void sched_info_arrive(struct task_struct *t) | 80 | static void sched_info_arrive(struct rq *rq, struct task_struct *t) |
| 81 | { | 81 | { |
| 82 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; | 82 | unsigned long long now = rq_clock(rq), delta = 0; |
| 83 | 83 | ||
| 84 | if (t->sched_info.last_queued) | 84 | if (t->sched_info.last_queued) |
| 85 | delta = now - t->sched_info.last_queued; | 85 | delta = now - t->sched_info.last_queued; |
| @@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t) | |||
| 88 | t->sched_info.last_arrival = now; | 88 | t->sched_info.last_arrival = now; |
| 89 | t->sched_info.pcount++; | 89 | t->sched_info.pcount++; |
| 90 | 90 | ||
| 91 | rq_sched_info_arrive(task_rq(t), delta); | 91 | rq_sched_info_arrive(rq, delta); |
| 92 | } | 92 | } |
| 93 | 93 | ||
| 94 | /* | 94 | /* |
| @@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t) | |||
| 96 | * the timestamp if it is already not set. It's assumed that | 96 | * the timestamp if it is already not set. It's assumed that |
| 97 | * sched_info_dequeued() will clear that stamp when appropriate. | 97 | * sched_info_dequeued() will clear that stamp when appropriate. |
| 98 | */ | 98 | */ |
| 99 | static inline void sched_info_queued(struct task_struct *t) | 99 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) |
| 100 | { | 100 | { |
| 101 | if (unlikely(sched_info_on())) | 101 | if (unlikely(sched_info_on())) |
| 102 | if (!t->sched_info.last_queued) | 102 | if (!t->sched_info.last_queued) |
| 103 | t->sched_info.last_queued = rq_clock(task_rq(t)); | 103 | t->sched_info.last_queued = rq_clock(rq); |
| 104 | } | 104 | } |
| 105 | 105 | ||
| 106 | /* | 106 | /* |
| @@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t) | |||
| 111 | * sched_info_queued() to mark that it has now again started waiting on | 111 | * sched_info_queued() to mark that it has now again started waiting on |
| 112 | * the runqueue. | 112 | * the runqueue. |
| 113 | */ | 113 | */ |
| 114 | static inline void sched_info_depart(struct task_struct *t) | 114 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) |
| 115 | { | 115 | { |
| 116 | unsigned long long delta = rq_clock(task_rq(t)) - | 116 | unsigned long long delta = rq_clock(rq) - |
| 117 | t->sched_info.last_arrival; | 117 | t->sched_info.last_arrival; |
| 118 | 118 | ||
| 119 | rq_sched_info_depart(task_rq(t), delta); | 119 | rq_sched_info_depart(rq, delta); |
| 120 | 120 | ||
| 121 | if (t->state == TASK_RUNNING) | 121 | if (t->state == TASK_RUNNING) |
| 122 | sched_info_queued(t); | 122 | sched_info_queued(rq, t); |
| 123 | } | 123 | } |
| 124 | 124 | ||
| 125 | /* | 125 | /* |
| @@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t) | |||
| 128 | * the idle task.) We are only called when prev != next. | 128 | * the idle task.) We are only called when prev != next. |
| 129 | */ | 129 | */ |
| 130 | static inline void | 130 | static inline void |
| 131 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | 131 | __sched_info_switch(struct rq *rq, |
| 132 | struct task_struct *prev, struct task_struct *next) | ||
| 132 | { | 133 | { |
| 133 | struct rq *rq = task_rq(prev); | ||
| 134 | |||
| 135 | /* | 134 | /* |
| 136 | * prev now departs the cpu. It's not interesting to record | 135 | * prev now departs the cpu. It's not interesting to record |
| 137 | * stats about how efficient we were at scheduling the idle | 136 | * stats about how efficient we were at scheduling the idle |
| 138 | * process, however. | 137 | * process, however. |
| 139 | */ | 138 | */ |
| 140 | if (prev != rq->idle) | 139 | if (prev != rq->idle) |
| 141 | sched_info_depart(prev); | 140 | sched_info_depart(rq, prev); |
| 142 | 141 | ||
| 143 | if (next != rq->idle) | 142 | if (next != rq->idle) |
| 144 | sched_info_arrive(next); | 143 | sched_info_arrive(rq, next); |
| 145 | } | 144 | } |
| 146 | static inline void | 145 | static inline void |
| 147 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | 146 | sched_info_switch(struct rq *rq, |
| 147 | struct task_struct *prev, struct task_struct *next) | ||
| 148 | { | 148 | { |
| 149 | if (unlikely(sched_info_on())) | 149 | if (unlikely(sched_info_on())) |
| 150 | __sched_info_switch(prev, next); | 150 | __sched_info_switch(rq, prev, next); |
| 151 | } | 151 | } |
| 152 | #else | 152 | #else |
| 153 | #define sched_info_queued(t) do { } while (0) | 153 | #define sched_info_queued(rq, t) do { } while (0) |
| 154 | #define sched_info_reset_dequeued(t) do { } while (0) | 154 | #define sched_info_reset_dequeued(t) do { } while (0) |
| 155 | #define sched_info_dequeued(t) do { } while (0) | 155 | #define sched_info_dequeued(rq, t) do { } while (0) |
| 156 | #define sched_info_switch(t, next) do { } while (0) | 156 | #define sched_info_depart(rq, t) do { } while (0) |
| 157 | #define sched_info_arrive(rq, next) do { } while (0) | ||
| 158 | #define sched_info_switch(rq, t, next) do { } while (0) | ||
| 157 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | 159 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
| 158 | 160 | ||
| 159 | /* | 161 | /* |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index e08fbeeb54b9..47197de8abd9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
| @@ -11,7 +11,7 @@ | |||
| 11 | 11 | ||
| 12 | #ifdef CONFIG_SMP | 12 | #ifdef CONFIG_SMP |
| 13 | static int | 13 | static int |
| 14 | select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) | 14 | select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) |
| 15 | { | 15 | { |
| 16 | return task_cpu(p); /* stop tasks as never migrate */ | 16 | return task_cpu(p); /* stop tasks as never migrate */ |
| 17 | } | 17 | } |
diff --git a/kernel/wait.c b/kernel/sched/wait.c index d550920e040c..7d50f794e248 100644 --- a/kernel/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue); | |||
| 53 | 53 | ||
| 54 | 54 | ||
| 55 | /* | 55 | /* |
| 56 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | ||
| 57 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | ||
| 58 | * number) then we wake all the non-exclusive tasks and one exclusive task. | ||
| 59 | * | ||
| 60 | * There are circumstances in which we can try to wake a task which has already | ||
| 61 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | ||
| 62 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | ||
| 63 | */ | ||
| 64 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | ||
| 65 | int nr_exclusive, int wake_flags, void *key) | ||
| 66 | { | ||
| 67 | wait_queue_t *curr, *next; | ||
| 68 | |||
| 69 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | ||
| 70 | unsigned flags = curr->flags; | ||
| 71 | |||
| 72 | if (curr->func(curr, mode, wake_flags, key) && | ||
| 73 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | ||
| 74 | break; | ||
| 75 | } | ||
| 76 | } | ||
| 77 | |||
| 78 | /** | ||
| 79 | * __wake_up - wake up threads blocked on a waitqueue. | ||
| 80 | * @q: the waitqueue | ||
| 81 | * @mode: which threads | ||
| 82 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
| 83 | * @key: is directly passed to the wakeup function | ||
| 84 | * | ||
| 85 | * It may be assumed that this function implies a write memory barrier before | ||
| 86 | * changing the task state if and only if any tasks are woken up. | ||
| 87 | */ | ||
| 88 | void __wake_up(wait_queue_head_t *q, unsigned int mode, | ||
| 89 | int nr_exclusive, void *key) | ||
| 90 | { | ||
| 91 | unsigned long flags; | ||
| 92 | |||
| 93 | spin_lock_irqsave(&q->lock, flags); | ||
| 94 | __wake_up_common(q, mode, nr_exclusive, 0, key); | ||
| 95 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 96 | } | ||
| 97 | EXPORT_SYMBOL(__wake_up); | ||
| 98 | |||
| 99 | /* | ||
| 100 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | ||
| 101 | */ | ||
| 102 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) | ||
| 103 | { | ||
| 104 | __wake_up_common(q, mode, nr, 0, NULL); | ||
| 105 | } | ||
| 106 | EXPORT_SYMBOL_GPL(__wake_up_locked); | ||
| 107 | |||
| 108 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | ||
| 109 | { | ||
| 110 | __wake_up_common(q, mode, 1, 0, key); | ||
| 111 | } | ||
| 112 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
| 113 | |||
| 114 | /** | ||
| 115 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | ||
| 116 | * @q: the waitqueue | ||
| 117 | * @mode: which threads | ||
| 118 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
| 119 | * @key: opaque value to be passed to wakeup targets | ||
| 120 | * | ||
| 121 | * The sync wakeup differs that the waker knows that it will schedule | ||
| 122 | * away soon, so while the target thread will be woken up, it will not | ||
| 123 | * be migrated to another CPU - ie. the two threads are 'synchronized' | ||
| 124 | * with each other. This can prevent needless bouncing between CPUs. | ||
| 125 | * | ||
| 126 | * On UP it can prevent extra preemption. | ||
| 127 | * | ||
| 128 | * It may be assumed that this function implies a write memory barrier before | ||
| 129 | * changing the task state if and only if any tasks are woken up. | ||
| 130 | */ | ||
| 131 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | ||
| 132 | int nr_exclusive, void *key) | ||
| 133 | { | ||
| 134 | unsigned long flags; | ||
| 135 | int wake_flags = 1; /* XXX WF_SYNC */ | ||
| 136 | |||
| 137 | if (unlikely(!q)) | ||
| 138 | return; | ||
| 139 | |||
| 140 | if (unlikely(nr_exclusive != 1)) | ||
| 141 | wake_flags = 0; | ||
| 142 | |||
| 143 | spin_lock_irqsave(&q->lock, flags); | ||
| 144 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); | ||
| 145 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 146 | } | ||
| 147 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | ||
| 148 | |||
| 149 | /* | ||
| 150 | * __wake_up_sync - see __wake_up_sync_key() | ||
| 151 | */ | ||
| 152 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | ||
| 153 | { | ||
| 154 | __wake_up_sync_key(q, mode, nr_exclusive, NULL); | ||
| 155 | } | ||
| 156 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | ||
| 157 | |||
| 158 | /* | ||
| 56 | * Note: we use "set_current_state()" _after_ the wait-queue add, | 159 | * Note: we use "set_current_state()" _after_ the wait-queue add, |
| 57 | * because we need a memory barrier there on SMP, so that any | 160 | * because we need a memory barrier there on SMP, so that any |
| 58 | * wake-function that tests for the wait-queue being active | 161 | * wake-function that tests for the wait-queue being active |
| @@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
| 92 | } | 195 | } |
| 93 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 196 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
| 94 | 197 | ||
| 198 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | ||
| 199 | { | ||
| 200 | unsigned long flags; | ||
| 201 | |||
| 202 | if (signal_pending_state(state, current)) | ||
| 203 | return -ERESTARTSYS; | ||
| 204 | |||
| 205 | wait->private = current; | ||
| 206 | wait->func = autoremove_wake_function; | ||
| 207 | |||
| 208 | spin_lock_irqsave(&q->lock, flags); | ||
| 209 | if (list_empty(&wait->task_list)) { | ||
| 210 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | ||
| 211 | __add_wait_queue_tail(q, wait); | ||
| 212 | else | ||
| 213 | __add_wait_queue(q, wait); | ||
| 214 | } | ||
| 215 | set_current_state(state); | ||
| 216 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 217 | |||
| 218 | return 0; | ||
| 219 | } | ||
| 220 | EXPORT_SYMBOL(prepare_to_wait_event); | ||
| 221 | |||
| 95 | /** | 222 | /** |
| 96 | * finish_wait - clean up after waiting in a queue | 223 | * finish_wait - clean up after waiting in a queue |
| 97 | * @q: waitqueue waited on | 224 | * @q: waitqueue waited on |
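The loop in __wake_up_common() moved here encodes the wake-one semantics its comment describes: non-exclusive waiters are always woken, exclusive waiters consume nr_exclusive slots, and a callback that reports the task was already running does not consume a slot. Below is a simplified userspace model of that loop; the waiter structure is a stand-in for wait_queue_t and is not the kernel API.

#include <stdio.h>
#include <stdbool.h>

struct waiter {
        const char *name;
        bool exclusive;
        bool (*func)(struct waiter *);  /* returns true if actually woken */
};

static bool wake_ok(struct waiter *w)
{
        printf("woke %s\n", w->name);
        return true;
}

static void wake_up_common_model(struct waiter *q, int n, int nr_exclusive)
{
        for (int i = 0; i < n; i++) {
                struct waiter *curr = &q[i];

                if (curr->func(curr) && curr->exclusive && !--nr_exclusive)
                        break;  /* woke enough exclusive waiters */
        }
}

int main(void)
{
        struct waiter q[] = {
                { "A (non-exclusive)", false, wake_ok },
                { "B (exclusive)",     true,  wake_ok },
                { "C (exclusive)",     true,  wake_ok },        /* not reached with nr=1 */
        };

        wake_up_common_model(q, 3, 1);
        return 0;
}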
diff --git a/kernel/smp.c b/kernel/smp.c index 0564571dcdf7..f5768b0c816a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -524,6 +524,11 @@ void __init setup_nr_cpu_ids(void) | |||
| 524 | nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; | 524 | nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; |
| 525 | } | 525 | } |
| 526 | 526 | ||
| 527 | void __weak smp_announce(void) | ||
| 528 | { | ||
| 529 | printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); | ||
| 530 | } | ||
| 531 | |||
| 527 | /* Called by boot processor to activate the rest. */ | 532 | /* Called by boot processor to activate the rest. */ |
| 528 | void __init smp_init(void) | 533 | void __init smp_init(void) |
| 529 | { | 534 | { |
| @@ -540,7 +545,7 @@ void __init smp_init(void) | |||
| 540 | } | 545 | } |
| 541 | 546 | ||
| 542 | /* Any cleanup work */ | 547 | /* Any cleanup work */ |
| 543 | printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); | 548 | smp_announce(); |
| 544 | smp_cpus_done(setup_max_cpus); | 549 | smp_cpus_done(setup_max_cpus); |
| 545 | } | 550 | } |
| 546 | 551 | ||
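Making smp_announce() __weak lets an architecture replace the generic one-line boot message without touching common code. The userspace demonstration below shows only the weak-symbol mechanism itself; the override sketched in the comment would normally live in a separate object file, and the CPU count is hard-coded for the example.

#include <stdio.h>

void __attribute__((weak)) smp_announce(void)
{
        printf("Brought up %d CPUs\n", 4);      /* generic default */
}

/*
 * An architecture could provide a strong definition in its own file, e.g.:
 *
 *      void smp_announce(void)
 *      {
 *              printf("Brought up %d nodes, %d CPUs\n", 1, 4);
 *      }
 */

int main(void)
{
        smp_announce();         /* weak default runs unless overridden at link time */
        return 0;
}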
diff --git a/kernel/softirq.c b/kernel/softirq.c index d7d498d8cc4f..b24988353458 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -29,7 +29,6 @@ | |||
| 29 | #define CREATE_TRACE_POINTS | 29 | #define CREATE_TRACE_POINTS |
| 30 | #include <trace/events/irq.h> | 30 | #include <trace/events/irq.h> |
| 31 | 31 | ||
| 32 | #include <asm/irq.h> | ||
| 33 | /* | 32 | /* |
| 34 | - No shared variables, all the data are CPU local. | 33 | - No shared variables, all the data are CPU local. |
| 35 | - If a softirq needs serialization, let it serialize itself | 34 | - If a softirq needs serialization, let it serialize itself |
| @@ -100,13 +99,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
| 100 | 99 | ||
| 101 | raw_local_irq_save(flags); | 100 | raw_local_irq_save(flags); |
| 102 | /* | 101 | /* |
| 103 | * The preempt tracer hooks into add_preempt_count and will break | 102 | * The preempt tracer hooks into preempt_count_add and will break |
| 104 | * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET | 103 | * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET |
| 105 | * is set and before current->softirq_enabled is cleared. | 104 | * is set and before current->softirq_enabled is cleared. |
| 106 | * We must manually increment preempt_count here and manually | 105 | * We must manually increment preempt_count here and manually |
| 107 | * call the trace_preempt_off later. | 106 | * call the trace_preempt_off later. |
| 108 | */ | 107 | */ |
| 109 | preempt_count() += cnt; | 108 | __preempt_count_add(cnt); |
| 110 | /* | 109 | /* |
| 111 | * Were softirqs turned off above: | 110 | * Were softirqs turned off above: |
| 112 | */ | 111 | */ |
| @@ -120,7 +119,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
| 120 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 119 | #else /* !CONFIG_TRACE_IRQFLAGS */ |
| 121 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) | 120 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) |
| 122 | { | 121 | { |
| 123 | add_preempt_count(cnt); | 122 | preempt_count_add(cnt); |
| 124 | barrier(); | 123 | barrier(); |
| 125 | } | 124 | } |
| 126 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 125 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
| @@ -134,12 +133,11 @@ EXPORT_SYMBOL(local_bh_disable); | |||
| 134 | 133 | ||
| 135 | static void __local_bh_enable(unsigned int cnt) | 134 | static void __local_bh_enable(unsigned int cnt) |
| 136 | { | 135 | { |
| 137 | WARN_ON_ONCE(in_irq()); | ||
| 138 | WARN_ON_ONCE(!irqs_disabled()); | 136 | WARN_ON_ONCE(!irqs_disabled()); |
| 139 | 137 | ||
| 140 | if (softirq_count() == cnt) | 138 | if (softirq_count() == cnt) |
| 141 | trace_softirqs_on(_RET_IP_); | 139 | trace_softirqs_on(_RET_IP_); |
| 142 | sub_preempt_count(cnt); | 140 | preempt_count_sub(cnt); |
| 143 | } | 141 | } |
| 144 | 142 | ||
| 145 | /* | 143 | /* |
| @@ -149,6 +147,7 @@ static void __local_bh_enable(unsigned int cnt) | |||
| 149 | */ | 147 | */ |
| 150 | void _local_bh_enable(void) | 148 | void _local_bh_enable(void) |
| 151 | { | 149 | { |
| 150 | WARN_ON_ONCE(in_irq()); | ||
| 152 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); | 151 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); |
| 153 | } | 152 | } |
| 154 | 153 | ||
| @@ -169,12 +168,17 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
| 169 | * Keep preemption disabled until we are done with | 168 | * Keep preemption disabled until we are done with |
| 170 | * softirq processing: | 169 | * softirq processing: |
| 171 | */ | 170 | */ |
| 172 | sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); | 171 | preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); |
| 173 | 172 | ||
| 174 | if (unlikely(!in_interrupt() && local_softirq_pending())) | 173 | if (unlikely(!in_interrupt() && local_softirq_pending())) { |
| 174 | /* | ||
| 175 | * Run softirq if any pending. And do it in its own stack | ||
| 176 | * as we may be calling this deep in a task call stack already. | ||
| 177 | */ | ||
| 175 | do_softirq(); | 178 | do_softirq(); |
| 179 | } | ||
| 176 | 180 | ||
| 177 | dec_preempt_count(); | 181 | preempt_count_dec(); |
| 178 | #ifdef CONFIG_TRACE_IRQFLAGS | 182 | #ifdef CONFIG_TRACE_IRQFLAGS |
| 179 | local_irq_enable(); | 183 | local_irq_enable(); |
| 180 | #endif | 184 | #endif |
| @@ -256,7 +260,7 @@ restart: | |||
| 256 | " exited with %08x?\n", vec_nr, | 260 | " exited with %08x?\n", vec_nr, |
| 257 | softirq_to_name[vec_nr], h->action, | 261 | softirq_to_name[vec_nr], h->action, |
| 258 | prev_count, preempt_count()); | 262 | prev_count, preempt_count()); |
| 259 | preempt_count() = prev_count; | 263 | preempt_count_set(prev_count); |
| 260 | } | 264 | } |
| 261 | 265 | ||
| 262 | rcu_bh_qs(cpu); | 266 | rcu_bh_qs(cpu); |
| @@ -280,10 +284,11 @@ restart: | |||
| 280 | 284 | ||
| 281 | account_irq_exit_time(current); | 285 | account_irq_exit_time(current); |
| 282 | __local_bh_enable(SOFTIRQ_OFFSET); | 286 | __local_bh_enable(SOFTIRQ_OFFSET); |
| 287 | WARN_ON_ONCE(in_interrupt()); | ||
| 283 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 288 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
| 284 | } | 289 | } |
| 285 | 290 | ||
| 286 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 291 | |
| 287 | 292 | ||
| 288 | asmlinkage void do_softirq(void) | 293 | asmlinkage void do_softirq(void) |
| 289 | { | 294 | { |
| @@ -298,13 +303,11 @@ asmlinkage void do_softirq(void) | |||
| 298 | pending = local_softirq_pending(); | 303 | pending = local_softirq_pending(); |
| 299 | 304 | ||
| 300 | if (pending) | 305 | if (pending) |
| 301 | __do_softirq(); | 306 | do_softirq_own_stack(); |
| 302 | 307 | ||
| 303 | local_irq_restore(flags); | 308 | local_irq_restore(flags); |
| 304 | } | 309 | } |
| 305 | 310 | ||
| 306 | #endif | ||
| 307 | |||
| 308 | /* | 311 | /* |
| 309 | * Enter an interrupt context. | 312 | * Enter an interrupt context. |
| 310 | */ | 313 | */ |
| @@ -329,15 +332,21 @@ void irq_enter(void) | |||
| 329 | static inline void invoke_softirq(void) | 332 | static inline void invoke_softirq(void) |
| 330 | { | 333 | { |
| 331 | if (!force_irqthreads) { | 334 | if (!force_irqthreads) { |
| 335 | #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK | ||
| 332 | /* | 336 | /* |
| 333 | * We can safely execute softirq on the current stack if | 337 | * We can safely execute softirq on the current stack if |
| 334 | * it is the irq stack, because it should be near empty | 338 | * it is the irq stack, because it should be near empty |
| 335 | * at this stage. But we have no way to know if the arch | 339 | * at this stage. |
| 336 | * calls irq_exit() on the irq stack. So call softirq | ||
| 337 | * in its own stack to prevent from any overrun on top | ||
| 338 | * of a potentially deep task stack. | ||
| 339 | */ | 340 | */ |
| 340 | do_softirq(); | 341 | __do_softirq(); |
| 342 | #else | ||
| 343 | /* | ||
| 344 | * Otherwise, irq_exit() is called on the task stack that can | ||
| 345 | * be potentially deep already. So call softirq in its own stack | ||
| 346 | * to prevent from any overrun. | ||
| 347 | */ | ||
| 348 | do_softirq_own_stack(); | ||
| 349 | #endif | ||
| 341 | } else { | 350 | } else { |
| 342 | wakeup_softirqd(); | 351 | wakeup_softirqd(); |
| 343 | } | 352 | } |
| @@ -369,7 +378,7 @@ void irq_exit(void) | |||
| 369 | 378 | ||
| 370 | account_irq_exit_time(current); | 379 | account_irq_exit_time(current); |
| 371 | trace_hardirq_exit(); | 380 | trace_hardirq_exit(); |
| 372 | sub_preempt_count(HARDIRQ_OFFSET); | 381 | preempt_count_sub(HARDIRQ_OFFSET); |
| 373 | if (!in_interrupt() && local_softirq_pending()) | 382 | if (!in_interrupt() && local_softirq_pending()) |
| 374 | invoke_softirq(); | 383 | invoke_softirq(); |
| 375 | 384 | ||
| @@ -771,6 +780,10 @@ static void run_ksoftirqd(unsigned int cpu) | |||
| 771 | { | 780 | { |
| 772 | local_irq_disable(); | 781 | local_irq_disable(); |
| 773 | if (local_softirq_pending()) { | 782 | if (local_softirq_pending()) { |
| 783 | /* | ||
| 784 | * We can safely run softirq on inline stack, as we are not deep | ||
| 785 | * in the task stack here. | ||
| 786 | */ | ||
| 774 | __do_softirq(); | 787 | __do_softirq(); |
| 775 | rcu_note_context_switch(cpu); | 788 | rcu_note_context_switch(cpu); |
| 776 | local_irq_enable(); | 789 | local_irq_enable(); |
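The conversions above replace open-coded preempt_count() arithmetic with the preempt_count_add()/preempt_count_sub()/preempt_count_set() accessors; the underlying idea is unchanged: the count is a packed bitfield, and BH or hardirq nesting is expressed by adding fixed offsets to it. The userspace model below hard-codes a plausible bit layout purely for illustration.

#include <stdio.h>

#define PREEMPT_BITS            8
#define SOFTIRQ_BITS            8
#define SOFTIRQ_SHIFT           PREEMPT_BITS
#define HARDIRQ_SHIFT           (SOFTIRQ_SHIFT + SOFTIRQ_BITS)

#define SOFTIRQ_OFFSET          (1UL << SOFTIRQ_SHIFT)
#define SOFTIRQ_DISABLE_OFFSET  (2 * SOFTIRQ_OFFSET)
#define HARDIRQ_OFFSET          (1UL << HARDIRQ_SHIFT)

static unsigned long preempt_count;

static void preempt_count_add(unsigned long val) { preempt_count += val; }
static void preempt_count_sub(unsigned long val) { preempt_count -= val; }

int main(void)
{
        preempt_count_add(SOFTIRQ_DISABLE_OFFSET);      /* ~ local_bh_disable() */
        preempt_count_add(HARDIRQ_OFFSET);              /* ~ irq_enter() */

        /* any bits above the preempt field mean "in interrupt context" */
        printf("in_interrupt: %s\n",
               (preempt_count & ~((1UL << PREEMPT_BITS) - 1)) ? "yes" : "no");

        preempt_count_sub(HARDIRQ_OFFSET);              /* ~ irq_exit() */
        preempt_count_sub(SOFTIRQ_DISABLE_OFFSET);      /* ~ local_bh_enable() */
        printf("final count: %lu\n", preempt_count);
        return 0;
}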
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c09f2955ae30..84571e09c907 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/kallsyms.h> | 20 | #include <linux/kallsyms.h> |
| 21 | #include <linux/smpboot.h> | 21 | #include <linux/smpboot.h> |
| 22 | #include <linux/atomic.h> | 22 | #include <linux/atomic.h> |
| 23 | #include <linux/lglock.h> | ||
| 23 | 24 | ||
| 24 | /* | 25 | /* |
| 25 | * Structure to determine completion condition and record errors. May | 26 | * Structure to determine completion condition and record errors. May |
| @@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | |||
| 43 | static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); | 44 | static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); |
| 44 | static bool stop_machine_initialized = false; | 45 | static bool stop_machine_initialized = false; |
| 45 | 46 | ||
| 47 | /* | ||
| 48 | * Avoids a race between stop_two_cpus and global stop_cpus, where | ||
| 49 | * the stoppers could get queued up in reverse order, leading to | ||
| 50 | * system deadlock. Using an lglock means stop_two_cpus remains | ||
| 51 | * relatively cheap. | ||
| 52 | */ | ||
| 53 | DEFINE_STATIC_LGLOCK(stop_cpus_lock); | ||
| 54 | |||
| 46 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) | 55 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) |
| 47 | { | 56 | { |
| 48 | memset(done, 0, sizeof(*done)); | 57 | memset(done, 0, sizeof(*done)); |
| @@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | |||
| 115 | return done.executed ? done.ret : -ENOENT; | 124 | return done.executed ? done.ret : -ENOENT; |
| 116 | } | 125 | } |
| 117 | 126 | ||
| 127 | /* This controls the threads on each CPU. */ | ||
| 128 | enum multi_stop_state { | ||
| 129 | /* Dummy starting state for thread. */ | ||
| 130 | MULTI_STOP_NONE, | ||
| 131 | /* Awaiting everyone to be scheduled. */ | ||
| 132 | MULTI_STOP_PREPARE, | ||
| 133 | /* Disable interrupts. */ | ||
| 134 | MULTI_STOP_DISABLE_IRQ, | ||
| 135 | /* Run the function */ | ||
| 136 | MULTI_STOP_RUN, | ||
| 137 | /* Exit */ | ||
| 138 | MULTI_STOP_EXIT, | ||
| 139 | }; | ||
| 140 | |||
| 141 | struct multi_stop_data { | ||
| 142 | int (*fn)(void *); | ||
| 143 | void *data; | ||
| 144 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ | ||
| 145 | unsigned int num_threads; | ||
| 146 | const struct cpumask *active_cpus; | ||
| 147 | |||
| 148 | enum multi_stop_state state; | ||
| 149 | atomic_t thread_ack; | ||
| 150 | }; | ||
| 151 | |||
| 152 | static void set_state(struct multi_stop_data *msdata, | ||
| 153 | enum multi_stop_state newstate) | ||
| 154 | { | ||
| 155 | /* Reset ack counter. */ | ||
| 156 | atomic_set(&msdata->thread_ack, msdata->num_threads); | ||
| 157 | smp_wmb(); | ||
| 158 | msdata->state = newstate; | ||
| 159 | } | ||
| 160 | |||
| 161 | /* Last one to ack a state moves to the next state. */ | ||
| 162 | static void ack_state(struct multi_stop_data *msdata) | ||
| 163 | { | ||
| 164 | if (atomic_dec_and_test(&msdata->thread_ack)) | ||
| 165 | set_state(msdata, msdata->state + 1); | ||
| 166 | } | ||
| 167 | |||
| 168 | /* This is the cpu_stop function which stops the CPU. */ | ||
| 169 | static int multi_cpu_stop(void *data) | ||
| 170 | { | ||
| 171 | struct multi_stop_data *msdata = data; | ||
| 172 | enum multi_stop_state curstate = MULTI_STOP_NONE; | ||
| 173 | int cpu = smp_processor_id(), err = 0; | ||
| 174 | unsigned long flags; | ||
| 175 | bool is_active; | ||
| 176 | |||
| 177 | /* | ||
| 178 | * When called from stop_machine_from_inactive_cpu(), irq might | ||
| 179 | * already be disabled. Save the state and restore it on exit. | ||
| 180 | */ | ||
| 181 | local_save_flags(flags); | ||
| 182 | |||
| 183 | if (!msdata->active_cpus) | ||
| 184 | is_active = cpu == cpumask_first(cpu_online_mask); | ||
| 185 | else | ||
| 186 | is_active = cpumask_test_cpu(cpu, msdata->active_cpus); | ||
| 187 | |||
| 188 | /* Simple state machine */ | ||
| 189 | do { | ||
| 190 | /* Chill out and ensure we re-read multi_stop_state. */ | ||
| 191 | cpu_relax(); | ||
| 192 | if (msdata->state != curstate) { | ||
| 193 | curstate = msdata->state; | ||
| 194 | switch (curstate) { | ||
| 195 | case MULTI_STOP_DISABLE_IRQ: | ||
| 196 | local_irq_disable(); | ||
| 197 | hard_irq_disable(); | ||
| 198 | break; | ||
| 199 | case MULTI_STOP_RUN: | ||
| 200 | if (is_active) | ||
| 201 | err = msdata->fn(msdata->data); | ||
| 202 | break; | ||
| 203 | default: | ||
| 204 | break; | ||
| 205 | } | ||
| 206 | ack_state(msdata); | ||
| 207 | } | ||
| 208 | } while (curstate != MULTI_STOP_EXIT); | ||
| 209 | |||
| 210 | local_irq_restore(flags); | ||
| 211 | return err; | ||
| 212 | } | ||
| 213 | |||
| 214 | struct irq_cpu_stop_queue_work_info { | ||
| 215 | int cpu1; | ||
| 216 | int cpu2; | ||
| 217 | struct cpu_stop_work *work1; | ||
| 218 | struct cpu_stop_work *work2; | ||
| 219 | }; | ||
| 220 | |||
| 221 | /* | ||
| 222 | * This function is always run with irqs and preemption disabled. | ||
| 223 | * This guarantees that both work1 and work2 get queued, before | ||
| 224 | * our local migrate thread gets the chance to preempt us. | ||
| 225 | */ | ||
| 226 | static void irq_cpu_stop_queue_work(void *arg) | ||
| 227 | { | ||
| 228 | struct irq_cpu_stop_queue_work_info *info = arg; | ||
| 229 | cpu_stop_queue_work(info->cpu1, info->work1); | ||
| 230 | cpu_stop_queue_work(info->cpu2, info->work2); | ||
| 231 | } | ||
| 232 | |||
| 233 | /** | ||
| 234 | * stop_two_cpus - stops two cpus | ||
| 235 | * @cpu1: the cpu to stop | ||
| 236 | * @cpu2: the other cpu to stop | ||
| 237 | * @fn: function to execute | ||
| 238 | * @arg: argument to @fn | ||
| 239 | * | ||
| 240 | * Stops both the current and specified CPU and runs @fn on one of them. | ||
| 241 | * | ||
| 242 | * returns when both are completed. | ||
| 243 | */ | ||
| 244 | int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) | ||
| 245 | { | ||
| 246 | struct cpu_stop_done done; | ||
| 247 | struct cpu_stop_work work1, work2; | ||
| 248 | struct irq_cpu_stop_queue_work_info call_args; | ||
| 249 | struct multi_stop_data msdata; | ||
| 250 | |||
| 251 | preempt_disable(); | ||
| 252 | msdata = (struct multi_stop_data){ | ||
| 253 | .fn = fn, | ||
| 254 | .data = arg, | ||
| 255 | .num_threads = 2, | ||
| 256 | .active_cpus = cpumask_of(cpu1), | ||
| 257 | }; | ||
| 258 | |||
| 259 | work1 = work2 = (struct cpu_stop_work){ | ||
| 260 | .fn = multi_cpu_stop, | ||
| 261 | .arg = &msdata, | ||
| 262 | .done = &done | ||
| 263 | }; | ||
| 264 | |||
| 265 | call_args = (struct irq_cpu_stop_queue_work_info){ | ||
| 266 | .cpu1 = cpu1, | ||
| 267 | .cpu2 = cpu2, | ||
| 268 | .work1 = &work1, | ||
| 269 | .work2 = &work2, | ||
| 270 | }; | ||
| 271 | |||
| 272 | cpu_stop_init_done(&done, 2); | ||
| 273 | set_state(&msdata, MULTI_STOP_PREPARE); | ||
| 274 | |||
| 275 | /* | ||
| 276 | * If we observe both CPUs active we know _cpu_down() cannot yet have | ||
| 277 | * queued its stop_machine works and therefore ours will get executed | ||
| 278 | * first. Or it's not either one of our CPUs that's getting unplugged, | ||
| 279 | * in which case we don't care. | ||
| 280 | * | ||
| 281 | * This relies on the stopper workqueues to be FIFO. | ||
| 282 | */ | ||
| 283 | if (!cpu_active(cpu1) || !cpu_active(cpu2)) { | ||
| 284 | preempt_enable(); | ||
| 285 | return -ENOENT; | ||
| 286 | } | ||
| 287 | |||
| 288 | lg_local_lock(&stop_cpus_lock); | ||
| 289 | /* | ||
| 290 | * Queuing needs to be done by the lowest numbered CPU, to ensure | ||
| 291 | * that works are always queued in the same order on every CPU. | ||
| 292 | * This prevents deadlocks. | ||
| 293 | */ | ||
| 294 | smp_call_function_single(min(cpu1, cpu2), | ||
| 295 | &irq_cpu_stop_queue_work, | ||
| 296 | &call_args, 0); | ||
| 297 | lg_local_unlock(&stop_cpus_lock); | ||
| 298 | preempt_enable(); | ||
| 299 | |||
| 300 | wait_for_completion(&done.completion); | ||
| 301 | |||
| 302 | return done.executed ? done.ret : -ENOENT; | ||
| 303 | } | ||
| 304 | |||
| 118 | /** | 305 | /** |
| 119 | * stop_one_cpu_nowait - stop a cpu but don't wait for completion | 306 | * stop_one_cpu_nowait - stop a cpu but don't wait for completion |
| 120 | * @cpu: cpu to stop | 307 | * @cpu: cpu to stop |
| @@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, | |||
| 159 | * preempted by a stopper which might wait for other stoppers | 346 | * preempted by a stopper which might wait for other stoppers |
| 160 | * to enter @fn which can lead to deadlock. | 347 | * to enter @fn which can lead to deadlock. |
| 161 | */ | 348 | */ |
| 162 | preempt_disable(); | 349 | lg_global_lock(&stop_cpus_lock); |
| 163 | for_each_cpu(cpu, cpumask) | 350 | for_each_cpu(cpu, cpumask) |
| 164 | cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); | 351 | cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); |
| 165 | preempt_enable(); | 352 | lg_global_unlock(&stop_cpus_lock); |
| 166 | } | 353 | } |
| 167 | 354 | ||
| 168 | static int __stop_cpus(const struct cpumask *cpumask, | 355 | static int __stop_cpus(const struct cpumask *cpumask, |
| @@ -359,98 +546,14 @@ early_initcall(cpu_stop_init); | |||
| 359 | 546 | ||
| 360 | #ifdef CONFIG_STOP_MACHINE | 547 | #ifdef CONFIG_STOP_MACHINE |
| 361 | 548 | ||
| 362 | /* This controls the threads on each CPU. */ | ||
| 363 | enum stopmachine_state { | ||
| 364 | /* Dummy starting state for thread. */ | ||
| 365 | STOPMACHINE_NONE, | ||
| 366 | /* Awaiting everyone to be scheduled. */ | ||
| 367 | STOPMACHINE_PREPARE, | ||
| 368 | /* Disable interrupts. */ | ||
| 369 | STOPMACHINE_DISABLE_IRQ, | ||
| 370 | /* Run the function */ | ||
| 371 | STOPMACHINE_RUN, | ||
| 372 | /* Exit */ | ||
| 373 | STOPMACHINE_EXIT, | ||
| 374 | }; | ||
| 375 | |||
| 376 | struct stop_machine_data { | ||
| 377 | int (*fn)(void *); | ||
| 378 | void *data; | ||
| 379 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ | ||
| 380 | unsigned int num_threads; | ||
| 381 | const struct cpumask *active_cpus; | ||
| 382 | |||
| 383 | enum stopmachine_state state; | ||
| 384 | atomic_t thread_ack; | ||
| 385 | }; | ||
| 386 | |||
| 387 | static void set_state(struct stop_machine_data *smdata, | ||
| 388 | enum stopmachine_state newstate) | ||
| 389 | { | ||
| 390 | /* Reset ack counter. */ | ||
| 391 | atomic_set(&smdata->thread_ack, smdata->num_threads); | ||
| 392 | smp_wmb(); | ||
| 393 | smdata->state = newstate; | ||
| 394 | } | ||
| 395 | |||
| 396 | /* Last one to ack a state moves to the next state. */ | ||
| 397 | static void ack_state(struct stop_machine_data *smdata) | ||
| 398 | { | ||
| 399 | if (atomic_dec_and_test(&smdata->thread_ack)) | ||
| 400 | set_state(smdata, smdata->state + 1); | ||
| 401 | } | ||
| 402 | |||
| 403 | /* This is the cpu_stop function which stops the CPU. */ | ||
| 404 | static int stop_machine_cpu_stop(void *data) | ||
| 405 | { | ||
| 406 | struct stop_machine_data *smdata = data; | ||
| 407 | enum stopmachine_state curstate = STOPMACHINE_NONE; | ||
| 408 | int cpu = smp_processor_id(), err = 0; | ||
| 409 | unsigned long flags; | ||
| 410 | bool is_active; | ||
| 411 | |||
| 412 | /* | ||
| 413 | * When called from stop_machine_from_inactive_cpu(), irq might | ||
| 414 | * already be disabled. Save the state and restore it on exit. | ||
| 415 | */ | ||
| 416 | local_save_flags(flags); | ||
| 417 | |||
| 418 | if (!smdata->active_cpus) | ||
| 419 | is_active = cpu == cpumask_first(cpu_online_mask); | ||
| 420 | else | ||
| 421 | is_active = cpumask_test_cpu(cpu, smdata->active_cpus); | ||
| 422 | |||
| 423 | /* Simple state machine */ | ||
| 424 | do { | ||
| 425 | /* Chill out and ensure we re-read stopmachine_state. */ | ||
| 426 | cpu_relax(); | ||
| 427 | if (smdata->state != curstate) { | ||
| 428 | curstate = smdata->state; | ||
| 429 | switch (curstate) { | ||
| 430 | case STOPMACHINE_DISABLE_IRQ: | ||
| 431 | local_irq_disable(); | ||
| 432 | hard_irq_disable(); | ||
| 433 | break; | ||
| 434 | case STOPMACHINE_RUN: | ||
| 435 | if (is_active) | ||
| 436 | err = smdata->fn(smdata->data); | ||
| 437 | break; | ||
| 438 | default: | ||
| 439 | break; | ||
| 440 | } | ||
| 441 | ack_state(smdata); | ||
| 442 | } | ||
| 443 | } while (curstate != STOPMACHINE_EXIT); | ||
| 444 | |||
| 445 | local_irq_restore(flags); | ||
| 446 | return err; | ||
| 447 | } | ||
| 448 | |||
| 449 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 549 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) |
| 450 | { | 550 | { |
| 451 | struct stop_machine_data smdata = { .fn = fn, .data = data, | 551 | struct multi_stop_data msdata = { |
| 452 | .num_threads = num_online_cpus(), | 552 | .fn = fn, |
| 453 | .active_cpus = cpus }; | 553 | .data = data, |
| 554 | .num_threads = num_online_cpus(), | ||
| 555 | .active_cpus = cpus, | ||
| 556 | }; | ||
| 454 | 557 | ||
| 455 | if (!stop_machine_initialized) { | 558 | if (!stop_machine_initialized) { |
| 456 | /* | 559 | /* |
| @@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
| 461 | unsigned long flags; | 564 | unsigned long flags; |
| 462 | int ret; | 565 | int ret; |
| 463 | 566 | ||
| 464 | WARN_ON_ONCE(smdata.num_threads != 1); | 567 | WARN_ON_ONCE(msdata.num_threads != 1); |
| 465 | 568 | ||
| 466 | local_irq_save(flags); | 569 | local_irq_save(flags); |
| 467 | hard_irq_disable(); | 570 | hard_irq_disable(); |
| @@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
| 472 | } | 575 | } |
| 473 | 576 | ||
| 474 | /* Set the initial state and stop all online cpus. */ | 577 | /* Set the initial state and stop all online cpus. */ |
| 475 | set_state(&smdata, STOPMACHINE_PREPARE); | 578 | set_state(&msdata, MULTI_STOP_PREPARE); |
| 476 | return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); | 579 | return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); |
| 477 | } | 580 | } |
| 478 | 581 | ||
| 479 | int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 582 | int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) |
| @@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine); | |||
| 513 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, | 616 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, |
| 514 | const struct cpumask *cpus) | 617 | const struct cpumask *cpus) |
| 515 | { | 618 | { |
| 516 | struct stop_machine_data smdata = { .fn = fn, .data = data, | 619 | struct multi_stop_data msdata = { .fn = fn, .data = data, |
| 517 | .active_cpus = cpus }; | 620 | .active_cpus = cpus }; |
| 518 | struct cpu_stop_done done; | 621 | struct cpu_stop_done done; |
| 519 | int ret; | 622 | int ret; |
| 520 | 623 | ||
| 521 | /* Local CPU must be inactive and CPU hotplug in progress. */ | 624 | /* Local CPU must be inactive and CPU hotplug in progress. */ |
| 522 | BUG_ON(cpu_active(raw_smp_processor_id())); | 625 | BUG_ON(cpu_active(raw_smp_processor_id())); |
| 523 | smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ | 626 | msdata.num_threads = num_active_cpus() + 1; /* +1 for local */ |
| 524 | 627 | ||
| 525 | /* No proper task established and can't sleep - busy wait for lock. */ | 628 | /* No proper task established and can't sleep - busy wait for lock. */ |
| 526 | while (!mutex_trylock(&stop_cpus_mutex)) | 629 | while (!mutex_trylock(&stop_cpus_mutex)) |
| 527 | cpu_relax(); | 630 | cpu_relax(); |
| 528 | 631 | ||
| 529 | /* Schedule work on other CPUs and execute directly for local CPU */ | 632 | /* Schedule work on other CPUs and execute directly for local CPU */ |
| 530 | set_state(&smdata, STOPMACHINE_PREPARE); | 633 | set_state(&msdata, MULTI_STOP_PREPARE); |
| 531 | cpu_stop_init_done(&done, num_active_cpus()); | 634 | cpu_stop_init_done(&done, num_active_cpus()); |
| 532 | queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, | 635 | queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata, |
| 533 | &done); | 636 | &done); |
| 534 | ret = stop_machine_cpu_stop(&smdata); | 637 | ret = multi_cpu_stop(&msdata); |
| 535 | 638 | ||
| 536 | /* Busy wait for completion. */ | 639 | /* Busy wait for completion. */ |
| 537 | while (!completion_done(&done.completion)) | 640 | while (!completion_done(&done.completion)) |
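The MULTI_STOP_* machinery moved above keeps all stopper threads in lockstep: each spins on a shared state word, and the last thread to acknowledge a state advances it for everyone. The pthread program below reproduces that ack-and-advance protocol in userspace; thread 0 plays the role of the active CPU, and everything else (thread count, states, busy-wait) is simplified for the sketch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

enum { STOP_NONE, STOP_PREPARE, STOP_RUN, STOP_EXIT };

#define NUM_THREADS     4

static atomic_int state = STOP_PREPARE;
static atomic_int thread_ack = NUM_THREADS;

static void ack_state(void)
{
        if (atomic_fetch_sub(&thread_ack, 1) == 1) {
                /* last one to ack: reset the counter and advance everyone */
                atomic_store(&thread_ack, NUM_THREADS);
                atomic_fetch_add(&state, 1);
        }
}

static void *stopper(void *arg)
{
        int curstate = STOP_NONE;

        do {
                int s = atomic_load(&state);   /* spin until the state changes */

                if (s != curstate) {
                        curstate = s;
                        if (curstate == STOP_RUN && (intptr_t)arg == 0)
                                puts("active stopper runs fn()");
                        ack_state();
                }
        } while (curstate != STOP_EXIT);
        return NULL;
}

int main(void)
{
        pthread_t tid[NUM_THREADS];
        intptr_t i;

        for (i = 0; i < NUM_THREADS; i++)
                pthread_create(&tid[i], NULL, stopper, (void *)i);
        for (i = 0; i < NUM_THREADS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}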
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2f06f3c6a3f..36547dddcdb8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, | |||
| 190 | 190 | ||
| 191 | #ifdef CONFIG_MAGIC_SYSRQ | 191 | #ifdef CONFIG_MAGIC_SYSRQ |
| 192 | /* Note: sysrq code uses it's own private copy */ | 192 | /* Note: sysrq code uses it's own private copy */ |
| 193 | static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; | 193 | static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; |
| 194 | 194 | ||
| 195 | static int sysrq_sysctl_handler(ctl_table *table, int write, | 195 | static int sysrq_sysctl_handler(ctl_table *table, int write, |
| 196 | void __user *buffer, size_t *lenp, | 196 | void __user *buffer, size_t *lenp, |
| @@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = { | |||
| 371 | .proc_handler = proc_dointvec, | 371 | .proc_handler = proc_dointvec, |
| 372 | }, | 372 | }, |
| 373 | { | 373 | { |
| 374 | .procname = "numa_balancing_scan_period_reset", | ||
| 375 | .data = &sysctl_numa_balancing_scan_period_reset, | ||
| 376 | .maxlen = sizeof(unsigned int), | ||
| 377 | .mode = 0644, | ||
| 378 | .proc_handler = proc_dointvec, | ||
| 379 | }, | ||
| 380 | { | ||
| 381 | .procname = "numa_balancing_scan_period_max_ms", | 374 | .procname = "numa_balancing_scan_period_max_ms", |
| 382 | .data = &sysctl_numa_balancing_scan_period_max, | 375 | .data = &sysctl_numa_balancing_scan_period_max, |
| 383 | .maxlen = sizeof(unsigned int), | 376 | .maxlen = sizeof(unsigned int), |
| @@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = { | |||
| 391 | .mode = 0644, | 384 | .mode = 0644, |
| 392 | .proc_handler = proc_dointvec, | 385 | .proc_handler = proc_dointvec, |
| 393 | }, | 386 | }, |
| 387 | { | ||
| 388 | .procname = "numa_balancing_settle_count", | ||
| 389 | .data = &sysctl_numa_balancing_settle_count, | ||
| 390 | .maxlen = sizeof(unsigned int), | ||
| 391 | .mode = 0644, | ||
| 392 | .proc_handler = proc_dointvec, | ||
| 393 | }, | ||
| 394 | { | ||
| 395 | .procname = "numa_balancing_migrate_deferred", | ||
| 396 | .data = &sysctl_numa_balancing_migrate_deferred, | ||
| 397 | .maxlen = sizeof(unsigned int), | ||
| 398 | .mode = 0644, | ||
| 399 | .proc_handler = proc_dointvec, | ||
| 400 | }, | ||
| 394 | #endif /* CONFIG_NUMA_BALANCING */ | 401 | #endif /* CONFIG_NUMA_BALANCING */ |
| 395 | #endif /* CONFIG_SCHED_DEBUG */ | 402 | #endif /* CONFIG_SCHED_DEBUG */ |
| 396 | { | 403 | { |
| @@ -1049,6 +1056,7 @@ static struct ctl_table kern_table[] = { | |||
| 1049 | .maxlen = sizeof(sysctl_perf_event_sample_rate), | 1056 | .maxlen = sizeof(sysctl_perf_event_sample_rate), |
| 1050 | .mode = 0644, | 1057 | .mode = 0644, |
| 1051 | .proc_handler = perf_proc_update_handler, | 1058 | .proc_handler = perf_proc_update_handler, |
| 1059 | .extra1 = &one, | ||
| 1052 | }, | 1060 | }, |
| 1053 | { | 1061 | { |
| 1054 | .procname = "perf_cpu_time_max_percent", | 1062 | .procname = "perf_cpu_time_max_percent", |
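
The new kern_table entries follow the usual ctl_table pattern: an integer backing variable exposed read-write through proc_dointvec. A minimal out-of-tree sketch of that same pattern, with hypothetical names (demo_knob ends up as /proc/sys/kernel/demo_knob), not part of this patch:

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/sysctl.h>

static unsigned int demo_knob;			/* hypothetical tunable */
static struct ctl_table_header *demo_header;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_knob",
		.data		= &demo_knob,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }					/* terminating entry */
};

static int __init demo_sysctl_init(void)
{
	demo_header = register_sysctl("kernel", demo_table);
	return demo_header ? 0 : -ENOMEM;
}

static void __exit demo_sysctl_exit(void)
{
	unregister_sysctl_table(demo_header);
}

module_init(demo_sysctl_init);
module_exit(demo_sysctl_exit);
MODULE_LICENSE("GPL");
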
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 2b62fe86f9ec..3ce6e8c5f3fc 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
| @@ -100,7 +100,7 @@ config NO_HZ_FULL | |||
| 100 | # RCU_USER_QS dependency | 100 | # RCU_USER_QS dependency |
| 101 | depends on HAVE_CONTEXT_TRACKING | 101 | depends on HAVE_CONTEXT_TRACKING |
| 102 | # VIRT_CPU_ACCOUNTING_GEN dependency | 102 | # VIRT_CPU_ACCOUNTING_GEN dependency |
| 103 | depends on 64BIT | 103 | depends on HAVE_VIRT_CPU_ACCOUNTING_GEN |
| 104 | select NO_HZ_COMMON | 104 | select NO_HZ_COMMON |
| 105 | select RCU_USER_QS | 105 | select RCU_USER_QS |
| 106 | select RCU_NOCB_CPU | 106 | select RCU_NOCB_CPU |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index eec50fcef9e4..88c9c65a430d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) | |||
| 490 | clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; | 490 | clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; |
| 491 | 491 | ||
| 492 | if (!alarmtimer_get_rtcdev()) | 492 | if (!alarmtimer_get_rtcdev()) |
| 493 | return -ENOTSUPP; | 493 | return -EINVAL; |
| 494 | 494 | ||
| 495 | return hrtimer_get_res(baseid, tp); | 495 | return hrtimer_get_res(baseid, tp); |
| 496 | } | 496 | } |
| @@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | |||
| 507 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; | 507 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; |
| 508 | 508 | ||
| 509 | if (!alarmtimer_get_rtcdev()) | 509 | if (!alarmtimer_get_rtcdev()) |
| 510 | return -ENOTSUPP; | 510 | return -EINVAL; |
| 511 | 511 | ||
| 512 | *tp = ktime_to_timespec(base->gettime()); | 512 | *tp = ktime_to_timespec(base->gettime()); |
| 513 | return 0; | 513 | return 0; |
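
From user space the visible effect of the alarmtimer change is the error code: probing an alarm clock on a system without a usable RTC now fails with EINVAL instead of the kernel-internal ENOTSUPP (524). A small probe, assuming CLOCK_BOOTTIME_ALARM (value 9 in the UAPI headers) is not defined by an older libc:

#include <errno.h>
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_BOOTTIME_ALARM
#define CLOCK_BOOTTIME_ALARM 9		/* UAPI value; assumption for old libc headers */
#endif

int main(void)
{
	struct timespec res;

	if (clock_getres(CLOCK_BOOTTIME_ALARM, &res) == -1) {
		/* Without an RTC this now reports EINVAL rather than "error 524". */
		perror("clock_getres(CLOCK_BOOTTIME_ALARM)");
		return 1;
	}
	printf("alarm clock resolution: %ld.%09ld s\n",
	       (long)res.tv_sec, res.tv_nsec);
	return 0;
}
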
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 38959c866789..086ad6043bcb 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -33,29 +33,64 @@ struct ce_unbind { | |||
| 33 | int res; | 33 | int res; |
| 34 | }; | 34 | }; |
| 35 | 35 | ||
| 36 | /** | 36 | static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, |
| 37 | * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds | 37 | bool ismax) |
| 38 | * @latch: value to convert | ||
| 39 | * @evt: pointer to clock event device descriptor | ||
| 40 | * | ||
| 41 | * Math helper, returns latch value converted to nanoseconds (bound checked) | ||
| 42 | */ | ||
| 43 | u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | ||
| 44 | { | 38 | { |
| 45 | u64 clc = (u64) latch << evt->shift; | 39 | u64 clc = (u64) latch << evt->shift; |
| 40 | u64 rnd; | ||
| 46 | 41 | ||
| 47 | if (unlikely(!evt->mult)) { | 42 | if (unlikely(!evt->mult)) { |
| 48 | evt->mult = 1; | 43 | evt->mult = 1; |
| 49 | WARN_ON(1); | 44 | WARN_ON(1); |
| 50 | } | 45 | } |
| 46 | rnd = (u64) evt->mult - 1; | ||
| 47 | |||
| 48 | /* | ||
| 49 | * Upper bound sanity check. If the backwards conversion is | ||
| 50 | * not equal latch, we know that the above shift overflowed. | ||
| 51 | */ | ||
| 52 | if ((clc >> evt->shift) != (u64)latch) | ||
| 53 | clc = ~0ULL; | ||
| 54 | |||
| 55 | /* | ||
| 56 | * Scaled math oddities: | ||
| 57 | * | ||
| 58 | * For mult <= (1 << shift) we can safely add mult - 1 to | ||
| 59 | * prevent integer rounding loss. So the backwards conversion | ||
| 60 | * from nsec to device ticks will be correct. | ||
| 61 | * | ||
| 62 | * For mult > (1 << shift), i.e. device frequency is > 1GHz we | ||
| 63 | * need to be careful. Adding mult - 1 will result in a value | ||
| 64 | * which when converted back to device ticks can be larger | ||
| 65 | * than latch by up to (mult - 1) >> shift. For the min_delta | ||
| 66 | * calculation we still want to apply this in order to stay | ||
| 67 | * above the minimum device ticks limit. For the upper limit | ||
| 68 | * we would end up with a latch value larger than the upper | ||
| 69 | * limit of the device, so we omit the add to stay below the | ||
| 70 | * device upper boundary. | ||
| 71 | * | ||
| 72 | * Also omit the add if it would overflow the u64 boundary. | ||
| 73 | */ | ||
| 74 | if ((~0ULL - clc > rnd) && | ||
| 75 | (!ismax || evt->mult <= (1U << evt->shift))) | ||
| 76 | clc += rnd; | ||
| 51 | 77 | ||
| 52 | do_div(clc, evt->mult); | 78 | do_div(clc, evt->mult); |
| 53 | if (clc < 1000) | ||
| 54 | clc = 1000; | ||
| 55 | if (clc > KTIME_MAX) | ||
| 56 | clc = KTIME_MAX; | ||
| 57 | 79 | ||
| 58 | return clc; | 80 | /* Deltas less than 1usec are pointless noise */ |
| 81 | return clc > 1000 ? clc : 1000; | ||
| 82 | } | ||
| 83 | |||
| 84 | /** | ||
| 85 | * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds | ||
| 86 | * @latch: value to convert | ||
| 87 | * @evt: pointer to clock event device descriptor | ||
| 88 | * | ||
| 89 | * Math helper, returns latch value converted to nanoseconds (bound checked) | ||
| 90 | */ | ||
| 91 | u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | ||
| 92 | { | ||
| 93 | return cev_delta2ns(latch, evt, false); | ||
| 59 | } | 94 | } |
| 60 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); | 95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); |
| 61 | 96 | ||
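
The bounded conversion in the new cev_delta2ns() can be exercised outside the kernel. The sketch below mirrors the logic with plain 64-bit arithmetic; the roughly-1MHz mult/shift pair in main() is an illustrative choice, not taken from any driver:

#include <stdint.h>
#include <stdio.h>

static uint64_t delta2ns(uint64_t latch, uint32_t mult, uint32_t shift, int ismax)
{
	uint64_t clc = latch << shift;
	uint64_t rnd = (uint64_t)mult - 1;

	/* Upper bound check: if the shift lost bits, saturate. */
	if ((clc >> shift) != latch)
		clc = ~0ULL;

	/*
	 * Round up by mult - 1, except for the max limit of a device with
	 * mult > 2^shift (faster than 1GHz), and never past the u64 limit.
	 * Mirrors the kernel expression; shift is assumed < 32 here.
	 */
	if ((~0ULL - clc > rnd) && (!ismax || mult <= (1U << shift)))
		clc += rnd;

	clc /= mult;

	return clc > 1000 ? clc : 1000;		/* sub-1us deltas are noise */
}

int main(void)
{
	/* ~1 MHz device: ticks = ns * mult >> shift, so mult/2^shift ~= 1/1000 */
	uint32_t mult = 16777, shift = 24;

	printf("123 ticks -> %llu ns\n",
	       (unsigned long long)delta2ns(123, mult, shift, 0));
	printf("max latch -> %llu ns\n",
	       (unsigned long long)delta2ns(0xffffffff, mult, shift, 1));
	return 0;
}
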
| @@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq) | |||
| 380 | sec = 600; | 415 | sec = 600; |
| 381 | 416 | ||
| 382 | clockevents_calc_mult_shift(dev, freq, sec); | 417 | clockevents_calc_mult_shift(dev, freq, sec); |
| 383 | dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); | 418 | dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); |
| 384 | dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); | 419 | dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); |
| 385 | } | 420 | } |
| 386 | 421 | ||
| 387 | /** | 422 | /** |
| @@ -584,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev, | |||
| 584 | const char *buf, size_t count) | 619 | const char *buf, size_t count) |
| 585 | { | 620 | { |
| 586 | char name[CS_NAME_LEN]; | 621 | char name[CS_NAME_LEN]; |
| 587 | size_t ret = sysfs_get_uname(buf, name, count); | 622 | ssize_t ret = sysfs_get_uname(buf, name, count); |
| 588 | struct clock_event_device *ce; | 623 | struct clock_event_device *ce; |
| 589 | 624 | ||
| 590 | if (ret < 0) | 625 | if (ret < 0) |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 50a8736757f3..ba3e502c955a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } | |||
| 479 | static inline void clocksource_resume_watchdog(void) { } | 479 | static inline void clocksource_resume_watchdog(void) { } |
| 480 | static inline int __clocksource_watchdog_kthread(void) { return 0; } | 480 | static inline int __clocksource_watchdog_kthread(void) { return 0; } |
| 481 | static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } | 481 | static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } |
| 482 | void clocksource_mark_unstable(struct clocksource *cs) { } | ||
| 482 | 483 | ||
| 483 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ | 484 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ |
| 484 | 485 | ||
| @@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) | |||
| 537 | } | 538 | } |
| 538 | 539 | ||
| 539 | /** | 540 | /** |
| 540 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 541 | * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted |
| 541 | * @cs: Pointer to clocksource | 542 | * @mult: cycle to nanosecond multiplier |
| 542 | * | 543 | * @shift: cycle to nanosecond divisor (power of two) |
| 544 | * @maxadj: maximum adjustment value to mult (~11%) | ||
| 545 | * @mask: bitmask for two's complement subtraction of non 64 bit counters | ||
| 543 | */ | 546 | */ |
| 544 | static u64 clocksource_max_deferment(struct clocksource *cs) | 547 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) |
| 545 | { | 548 | { |
| 546 | u64 max_nsecs, max_cycles; | 549 | u64 max_nsecs, max_cycles; |
| 547 | 550 | ||
| 548 | /* | 551 | /* |
| 549 | * Calculate the maximum number of cycles that we can pass to the | 552 | * Calculate the maximum number of cycles that we can pass to the |
| 550 | * cyc2ns function without overflowing a 64-bit signed result. The | 553 | * cyc2ns function without overflowing a 64-bit signed result. The |
| 551 | * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) | 554 | * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) |
| 552 | * which is equivalent to the below. | 555 | * which is equivalent to the below. |
| 553 | * max_cycles < (2^63)/(cs->mult + cs->maxadj) | 556 | * max_cycles < (2^63)/(mult + maxadj) |
| 554 | * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) | 557 | * max_cycles < 2^(log2((2^63)/(mult + maxadj))) |
| 555 | * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) | 558 | * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) |
| 556 | * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) | 559 | * max_cycles < 2^(63 - log2(mult + maxadj)) |
| 557 | * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) | 560 | * max_cycles < 1 << (63 - log2(mult + maxadj)) |
| 558 | * Please note that we add 1 to the result of the log2 to account for | 561 | * Please note that we add 1 to the result of the log2 to account for |
| 559 | * any rounding errors, ensure the above inequality is satisfied and | 562 | * any rounding errors, ensure the above inequality is satisfied and |
| 560 | * no overflow will occur. | 563 | * no overflow will occur. |
| 561 | */ | 564 | */ |
| 562 | max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); | 565 | max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); |
| 563 | 566 | ||
| 564 | /* | 567 | /* |
| 565 | * The actual maximum number of cycles we can defer the clocksource is | 568 | * The actual maximum number of cycles we can defer the clocksource is |
| 566 | * determined by the minimum of max_cycles and cs->mask. | 569 | * determined by the minimum of max_cycles and mask. |
| 567 | * Note: Here we subtract the maxadj to make sure we don't sleep for | 570 | * Note: Here we subtract the maxadj to make sure we don't sleep for |
| 568 | * too long if there's a large negative adjustment. | 571 | * too long if there's a large negative adjustment. |
| 569 | */ | 572 | */ |
| 570 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); | 573 | max_cycles = min(max_cycles, mask); |
| 571 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, | 574 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); |
| 572 | cs->shift); | 575 | |
| 576 | return max_nsecs; | ||
| 577 | } | ||
| 578 | |||
| 579 | /** | ||
| 580 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | ||
| 581 | * @cs: Pointer to clocksource | ||
| 582 | * | ||
| 583 | */ | ||
| 584 | static u64 clocksource_max_deferment(struct clocksource *cs) | ||
| 585 | { | ||
| 586 | u64 max_nsecs; | ||
| 573 | 587 | ||
| 588 | max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, | ||
| 589 | cs->mask); | ||
| 574 | /* | 590 | /* |
| 575 | * To ensure that the clocksource does not wrap whilst we are idle, | 591 | * To ensure that the clocksource does not wrap whilst we are idle, |
| 576 | * limit the time the clocksource can be deferred by 12.5%. Please | 592 | * limit the time the clocksource can be deferred by 12.5%. Please |
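
The factored-out clocks_calc_max_nsecs() is the documented inequality turned into code. A stand-alone version, fed an illustrative 1GHz 32-bit counter rather than real clocksource values, shows the order of magnitude it produces (a few seconds before the margin is applied):

#include <stdint.h>
#include <stdio.h>

static int ilog2_32(uint32_t v)			/* v must be non-zero */
{
	return 31 - __builtin_clz(v);
}

static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

static uint64_t calc_max_nsecs(uint32_t mult, uint32_t shift, uint32_t maxadj,
			       uint64_t mask)
{
	/* Largest cycle count for which cyc2ns() stays below 2^63. */
	uint64_t max_cycles = 1ULL << (63 - (ilog2_32(mult + maxadj) + 1));

	if (max_cycles > mask)			/* the counter width limits us too */
		max_cycles = mask;

	/* Use the smallest plausible mult so a negative adjustment stays safe. */
	return cyc2ns(max_cycles, mult - maxadj, shift);
}

int main(void)
{
	/* Hypothetical 1 GHz, 32-bit counter: mult = 2^30, shift = 30 (1 ns/cycle),
	 * with roughly 11% adjustment headroom as clocksource_max_adjustment() gives. */
	uint32_t mult = 1u << 30, shift = 30, maxadj = (mult / 100) * 11;
	uint64_t mask = 0xffffffffULL;

	printf("max deferment: ~%llu ns\n",
	       (unsigned long long)calc_max_nsecs(mult, shift, maxadj, mask));
	return 0;
}
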
| @@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev, | |||
| 893 | return count; | 909 | return count; |
| 894 | } | 910 | } |
| 895 | 911 | ||
| 896 | size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) | 912 | ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) |
| 897 | { | 913 | { |
| 898 | size_t ret = cnt; | 914 | size_t ret = cnt; |
| 899 | 915 | ||
| @@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, | |||
| 924 | struct device_attribute *attr, | 940 | struct device_attribute *attr, |
| 925 | const char *buf, size_t count) | 941 | const char *buf, size_t count) |
| 926 | { | 942 | { |
| 927 | size_t ret; | 943 | ssize_t ret; |
| 928 | 944 | ||
| 929 | mutex_lock(&clocksource_mutex); | 945 | mutex_lock(&clocksource_mutex); |
| 930 | 946 | ||
| @@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, | |||
| 952 | { | 968 | { |
| 953 | struct clocksource *cs; | 969 | struct clocksource *cs; |
| 954 | char name[CS_NAME_LEN]; | 970 | char name[CS_NAME_LEN]; |
| 955 | size_t ret; | 971 | ssize_t ret; |
| 956 | 972 | ||
| 957 | ret = sysfs_get_uname(buf, name, count); | 973 | ret = sysfs_get_uname(buf, name, count); |
| 958 | if (ret < 0) | 974 | if (ret < 0) |
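
The size_t to ssize_t changes in these sysfs store paths matter because sysfs_get_uname() returns a negative errno on failure; with an unsigned type the "if (ret < 0)" checks could never fire. A tiny user-space illustration of the trap, using a hypothetical helper (compilers typically warn that the unsigned comparison is always false, which is how such bugs get spotted):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Returns the name length on success, -EINVAL on failure. */
static ssize_t get_name(const char *buf)
{
	return *buf ? (ssize_t)strlen(buf) : -EINVAL;
}

int main(void)
{
	size_t  bad  = get_name("");	/* -EINVAL wraps to a huge unsigned value */
	ssize_t good = get_name("");

	printf("unsigned error check fires: %s\n", bad  < 0 ? "yes" : "no"); /* no  */
	printf("signed   error check fires: %s\n", good < 0 ? "yes" : "no"); /* yes */
	return 0;
}
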
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bb2215174f05..af8d1d4f3d55 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work) | |||
| 475 | * called as close as possible to 500 ms before the new second starts. | 475 | * called as close as possible to 500 ms before the new second starts. |
| 476 | * This code is run on a timer. If the clock is set, that timer | 476 | * This code is run on a timer. If the clock is set, that timer |
| 477 | * may not expire at the correct time. Thus, we adjust... | 477 | * may not expire at the correct time. Thus, we adjust... |
| 478 | * We want the clock to be within a couple of ticks from the target. | ||
| 478 | */ | 479 | */ |
| 479 | if (!ntp_synced()) { | 480 | if (!ntp_synced()) { |
| 480 | /* | 481 | /* |
| @@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work) | |||
| 485 | } | 486 | } |
| 486 | 487 | ||
| 487 | getnstimeofday(&now); | 488 | getnstimeofday(&now); |
| 488 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { | 489 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { |
| 489 | struct timespec adjust = now; | 490 | struct timespec adjust = now; |
| 490 | 491 | ||
| 491 | fail = -ENODEV; | 492 | fail = -ENODEV; |
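
The widened test changes the acceptance window around the half-second target from half a tick to five ticks on either side, so the periodic work item is far more likely to actually land inside it. With HZ=100, a value chosen here purely for illustration, that is +/-50 ms instead of +/-5 ms:

#include <stdio.h>

int main(void)
{
	const long NSEC_PER_SEC = 1000000000L;
	const long hz = 100;			/* illustrative HZ value */
	const long tick_nsec = NSEC_PER_SEC / hz;

	printf("old window: +/- %ld ms\n", tick_nsec / 2 / 1000000);
	printf("new window: +/- %ld ms\n", tick_nsec * 5 / 1000000);
	return 0;
}
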
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0b479a6a22bb..68b799375981 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
| @@ -8,25 +8,28 @@ | |||
| 8 | #include <linux/clocksource.h> | 8 | #include <linux/clocksource.h> |
| 9 | #include <linux/init.h> | 9 | #include <linux/init.h> |
| 10 | #include <linux/jiffies.h> | 10 | #include <linux/jiffies.h> |
| 11 | #include <linux/ktime.h> | ||
| 11 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
| 12 | #include <linux/moduleparam.h> | 13 | #include <linux/moduleparam.h> |
| 13 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
| 14 | #include <linux/syscore_ops.h> | 15 | #include <linux/syscore_ops.h> |
| 15 | #include <linux/timer.h> | 16 | #include <linux/hrtimer.h> |
| 16 | #include <linux/sched_clock.h> | 17 | #include <linux/sched_clock.h> |
| 18 | #include <linux/seqlock.h> | ||
| 19 | #include <linux/bitops.h> | ||
| 17 | 20 | ||
| 18 | struct clock_data { | 21 | struct clock_data { |
| 22 | ktime_t wrap_kt; | ||
| 19 | u64 epoch_ns; | 23 | u64 epoch_ns; |
| 20 | u32 epoch_cyc; | 24 | u64 epoch_cyc; |
| 21 | u32 epoch_cyc_copy; | 25 | seqcount_t seq; |
| 22 | unsigned long rate; | 26 | unsigned long rate; |
| 23 | u32 mult; | 27 | u32 mult; |
| 24 | u32 shift; | 28 | u32 shift; |
| 25 | bool suspended; | 29 | bool suspended; |
| 26 | }; | 30 | }; |
| 27 | 31 | ||
| 28 | static void sched_clock_poll(unsigned long wrap_ticks); | 32 | static struct hrtimer sched_clock_timer; |
| 29 | static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); | ||
| 30 | static int irqtime = -1; | 33 | static int irqtime = -1; |
| 31 | 34 | ||
| 32 | core_param(irqtime, irqtime, int, 0400); | 35 | core_param(irqtime, irqtime, int, 0400); |
| @@ -35,42 +38,46 @@ static struct clock_data cd = { | |||
| 35 | .mult = NSEC_PER_SEC / HZ, | 38 | .mult = NSEC_PER_SEC / HZ, |
| 36 | }; | 39 | }; |
| 37 | 40 | ||
| 38 | static u32 __read_mostly sched_clock_mask = 0xffffffff; | 41 | static u64 __read_mostly sched_clock_mask; |
| 39 | 42 | ||
| 40 | static u32 notrace jiffy_sched_clock_read(void) | 43 | static u64 notrace jiffy_sched_clock_read(void) |
| 41 | { | 44 | { |
| 42 | return (u32)(jiffies - INITIAL_JIFFIES); | 45 | /* |
| 46 | * We don't need to use get_jiffies_64 on 32-bit arches here | ||
| 47 | * because we register with BITS_PER_LONG | ||
| 48 | */ | ||
| 49 | return (u64)(jiffies - INITIAL_JIFFIES); | ||
| 43 | } | 50 | } |
| 44 | 51 | ||
| 45 | static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | 52 | static u32 __read_mostly (*read_sched_clock_32)(void); |
| 53 | |||
| 54 | static u64 notrace read_sched_clock_32_wrapper(void) | ||
| 55 | { | ||
| 56 | return read_sched_clock_32(); | ||
| 57 | } | ||
| 58 | |||
| 59 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | ||
| 46 | 60 | ||
| 47 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 61 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) |
| 48 | { | 62 | { |
| 49 | return (cyc * mult) >> shift; | 63 | return (cyc * mult) >> shift; |
| 50 | } | 64 | } |
| 51 | 65 | ||
| 52 | static unsigned long long notrace sched_clock_32(void) | 66 | unsigned long long notrace sched_clock(void) |
| 53 | { | 67 | { |
| 54 | u64 epoch_ns; | 68 | u64 epoch_ns; |
| 55 | u32 epoch_cyc; | 69 | u64 epoch_cyc; |
| 56 | u32 cyc; | 70 | u64 cyc; |
| 71 | unsigned long seq; | ||
| 57 | 72 | ||
| 58 | if (cd.suspended) | 73 | if (cd.suspended) |
| 59 | return cd.epoch_ns; | 74 | return cd.epoch_ns; |
| 60 | 75 | ||
| 61 | /* | ||
| 62 | * Load the epoch_cyc and epoch_ns atomically. We do this by | ||
| 63 | * ensuring that we always write epoch_cyc, epoch_ns and | ||
| 64 | * epoch_cyc_copy in strict order, and read them in strict order. | ||
| 65 | * If epoch_cyc and epoch_cyc_copy are not equal, then we're in | ||
| 66 | * the middle of an update, and we should repeat the load. | ||
| 67 | */ | ||
| 68 | do { | 76 | do { |
| 77 | seq = read_seqcount_begin(&cd.seq); | ||
| 69 | epoch_cyc = cd.epoch_cyc; | 78 | epoch_cyc = cd.epoch_cyc; |
| 70 | smp_rmb(); | ||
| 71 | epoch_ns = cd.epoch_ns; | 79 | epoch_ns = cd.epoch_ns; |
| 72 | smp_rmb(); | 80 | } while (read_seqcount_retry(&cd.seq, seq)); |
| 73 | } while (epoch_cyc != cd.epoch_cyc_copy); | ||
| 74 | 81 | ||
| 75 | cyc = read_sched_clock(); | 82 | cyc = read_sched_clock(); |
| 76 | cyc = (cyc - epoch_cyc) & sched_clock_mask; | 83 | cyc = (cyc - epoch_cyc) & sched_clock_mask; |
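
The hand-rolled epoch_cyc/epoch_cyc_copy dance is replaced here by the generic seqcount API. Stripped of the sched_clock specifics, the pattern looks like the sketch below; the names are hypothetical, and writers must still be serialized externally, which the patch does by keeping the update under raw_local_irq_save():

#include <linux/seqlock.h>
#include <linux/types.h>

struct snapshot {
	seqcount_t seq;
	u64 ns;
	u64 cyc;
};

static struct snapshot snap;

static void snap_init(void)
{
	seqcount_init(&snap.seq);
}

/* Single writer (or externally serialized writers) only. */
static void snap_update(u64 ns, u64 cyc)
{
	write_seqcount_begin(&snap.seq);
	snap.ns  = ns;
	snap.cyc = cyc;
	write_seqcount_end(&snap.seq);
}

/* Readers retry if an update raced with them, so they always see a
 * consistent (ns, cyc) pair. */
static void snap_read(u64 *ns, u64 *cyc)
{
	unsigned long seq;

	do {
		seq  = read_seqcount_begin(&snap.seq);
		*ns  = snap.ns;
		*cyc = snap.cyc;
	} while (read_seqcount_retry(&snap.seq, seq));
}
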
| @@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void) | |||
| 83 | static void notrace update_sched_clock(void) | 90 | static void notrace update_sched_clock(void) |
| 84 | { | 91 | { |
| 85 | unsigned long flags; | 92 | unsigned long flags; |
| 86 | u32 cyc; | 93 | u64 cyc; |
| 87 | u64 ns; | 94 | u64 ns; |
| 88 | 95 | ||
| 89 | cyc = read_sched_clock(); | 96 | cyc = read_sched_clock(); |
| 90 | ns = cd.epoch_ns + | 97 | ns = cd.epoch_ns + |
| 91 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | 98 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, |
| 92 | cd.mult, cd.shift); | 99 | cd.mult, cd.shift); |
| 93 | /* | 100 | |
| 94 | * Write epoch_cyc and epoch_ns in a way that the update is | ||
| 95 | * detectable in cyc_to_fixed_sched_clock(). | ||
| 96 | */ | ||
| 97 | raw_local_irq_save(flags); | 101 | raw_local_irq_save(flags); |
| 98 | cd.epoch_cyc_copy = cyc; | 102 | write_seqcount_begin(&cd.seq); |
| 99 | smp_wmb(); | ||
| 100 | cd.epoch_ns = ns; | 103 | cd.epoch_ns = ns; |
| 101 | smp_wmb(); | ||
| 102 | cd.epoch_cyc = cyc; | 104 | cd.epoch_cyc = cyc; |
| 105 | write_seqcount_end(&cd.seq); | ||
| 103 | raw_local_irq_restore(flags); | 106 | raw_local_irq_restore(flags); |
| 104 | } | 107 | } |
| 105 | 108 | ||
| 106 | static void sched_clock_poll(unsigned long wrap_ticks) | 109 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) |
| 107 | { | 110 | { |
| 108 | mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); | ||
| 109 | update_sched_clock(); | 111 | update_sched_clock(); |
| 112 | hrtimer_forward_now(hrt, cd.wrap_kt); | ||
| 113 | return HRTIMER_RESTART; | ||
| 110 | } | 114 | } |
| 111 | 115 | ||
| 112 | void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | 116 | void __init sched_clock_register(u64 (*read)(void), int bits, |
| 117 | unsigned long rate) | ||
| 113 | { | 118 | { |
| 114 | unsigned long r, w; | 119 | unsigned long r; |
| 115 | u64 res, wrap; | 120 | u64 res, wrap; |
| 116 | char r_unit; | 121 | char r_unit; |
| 117 | 122 | ||
| 118 | if (cd.rate > rate) | 123 | if (cd.rate > rate) |
| 119 | return; | 124 | return; |
| 120 | 125 | ||
| 121 | BUG_ON(bits > 32); | ||
| 122 | WARN_ON(!irqs_disabled()); | 126 | WARN_ON(!irqs_disabled()); |
| 123 | read_sched_clock = read; | 127 | read_sched_clock = read; |
| 124 | sched_clock_mask = (1ULL << bits) - 1; | 128 | sched_clock_mask = CLOCKSOURCE_MASK(bits); |
| 125 | cd.rate = rate; | 129 | cd.rate = rate; |
| 126 | 130 | ||
| 127 | /* calculate the mult/shift to convert counter ticks to ns. */ | 131 | /* calculate the mult/shift to convert counter ticks to ns. */ |
| 128 | clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); | 132 | clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); |
| 129 | 133 | ||
| 130 | r = rate; | 134 | r = rate; |
| 131 | if (r >= 4000000) { | 135 | if (r >= 4000000) { |
| @@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | |||
| 138 | r_unit = ' '; | 142 | r_unit = ' '; |
| 139 | 143 | ||
| 140 | /* calculate how many ns until we wrap */ | 144 | /* calculate how many ns until we wrap */ |
| 141 | wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); | 145 | wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); |
| 142 | do_div(wrap, NSEC_PER_MSEC); | 146 | cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); |
| 143 | w = wrap; | ||
| 144 | 147 | ||
| 145 | /* calculate the ns resolution of this counter */ | 148 | /* calculate the ns resolution of this counter */ |
| 146 | res = cyc_to_ns(1ULL, cd.mult, cd.shift); | 149 | res = cyc_to_ns(1ULL, cd.mult, cd.shift); |
| 147 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", | 150 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", |
| 148 | bits, r, r_unit, res, w); | 151 | bits, r, r_unit, res, wrap); |
| 149 | 152 | ||
| 150 | /* | ||
| 151 | * Start the timer to keep sched_clock() properly updated and | ||
| 152 | * sets the initial epoch. | ||
| 153 | */ | ||
| 154 | sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); | ||
| 155 | update_sched_clock(); | 153 | update_sched_clock(); |
| 156 | 154 | ||
| 157 | /* | 155 | /* |
| @@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | |||
| 166 | pr_debug("Registered %pF as sched_clock source\n", read); | 164 | pr_debug("Registered %pF as sched_clock source\n", read); |
| 167 | } | 165 | } |
| 168 | 166 | ||
| 169 | unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; | 167 | void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) |
| 170 | |||
| 171 | unsigned long long notrace sched_clock(void) | ||
| 172 | { | 168 | { |
| 173 | return sched_clock_func(); | 169 | read_sched_clock_32 = read; |
| 170 | sched_clock_register(read_sched_clock_32_wrapper, bits, rate); | ||
| 174 | } | 171 | } |
| 175 | 172 | ||
| 176 | void __init sched_clock_postinit(void) | 173 | void __init sched_clock_postinit(void) |
| @@ -180,14 +177,22 @@ void __init sched_clock_postinit(void) | |||
| 180 | * make it the final one one. | 177 | * make it the final one one. |
| 181 | */ | 178 | */ |
| 182 | if (read_sched_clock == jiffy_sched_clock_read) | 179 | if (read_sched_clock == jiffy_sched_clock_read) |
| 183 | setup_sched_clock(jiffy_sched_clock_read, 32, HZ); | 180 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); |
| 184 | 181 | ||
| 185 | sched_clock_poll(sched_clock_timer.data); | 182 | update_sched_clock(); |
| 183 | |||
| 184 | /* | ||
| 185 | * Start the timer to keep sched_clock() properly updated and | ||
| 186 | * sets the initial epoch. | ||
| 187 | */ | ||
| 188 | hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 189 | sched_clock_timer.function = sched_clock_poll; | ||
| 190 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | ||
| 186 | } | 191 | } |
| 187 | 192 | ||
| 188 | static int sched_clock_suspend(void) | 193 | static int sched_clock_suspend(void) |
| 189 | { | 194 | { |
| 190 | sched_clock_poll(sched_clock_timer.data); | 195 | sched_clock_poll(&sched_clock_timer); |
| 191 | cd.suspended = true; | 196 | cd.suspended = true; |
| 192 | return 0; | 197 | return 0; |
| 193 | } | 198 | } |
| @@ -195,7 +200,6 @@ static int sched_clock_suspend(void) | |||
| 195 | static void sched_clock_resume(void) | 200 | static void sched_clock_resume(void) |
| 196 | { | 201 | { |
| 197 | cd.epoch_cyc = read_sched_clock(); | 202 | cd.epoch_cyc = read_sched_clock(); |
| 198 | cd.epoch_cyc_copy = cd.epoch_cyc; | ||
| 199 | cd.suspended = false; | 203 | cd.suspended = false; |
| 200 | } | 204 | } |
| 201 | 205 | ||
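
The switch from a jiffies timer to an hrtimer uses the standard self-rearming pattern: forward the timer past now by the period and return HRTIMER_RESTART from the callback. A minimal sketch of that pattern, with hypothetical names and an arbitrary one-second period:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer poll_timer;
static ktime_t poll_period;

static enum hrtimer_restart poll_fn(struct hrtimer *hrt)
{
	/* ... refresh whatever periodic state is needed here ... */
	hrtimer_forward_now(hrt, poll_period);	/* re-arm one period from now */
	return HRTIMER_RESTART;
}

static void poll_start(void)
{
	poll_period = ns_to_ktime(NSEC_PER_SEC);	/* arbitrary 1 s period */
	hrtimer_init(&poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	poll_timer.function = poll_fn;
	hrtimer_start(&poll_timer, poll_period, HRTIMER_MODE_REL);
}
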
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 218bcb565fed..9532690daaa9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev, | |||
| 70 | struct clock_event_device *newdev) | 70 | struct clock_event_device *newdev) |
| 71 | { | 71 | { |
| 72 | if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || | 72 | if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || |
| 73 | (newdev->features & CLOCK_EVT_FEAT_PERCPU) || | ||
| 73 | (newdev->features & CLOCK_EVT_FEAT_C3STOP)) | 74 | (newdev->features & CLOCK_EVT_FEAT_C3STOP)) |
| 74 | return false; | 75 | return false; |
| 75 | 76 | ||
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index bc906cad709b..18e71f7fbc2a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
| @@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev); | |||
| 31 | 31 | ||
| 32 | extern void clockevents_shutdown(struct clock_event_device *dev); | 32 | extern void clockevents_shutdown(struct clock_event_device *dev); |
| 33 | 33 | ||
| 34 | extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | 34 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); |
| 35 | 35 | ||
| 36 | /* | 36 | /* |
| 37 | * NO_HZ / high resolution timer shared code | 37 | * NO_HZ / high resolution timer shared code |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 947ba25a95a0..3abf53418b67 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
| 1613 | * ktime_get_update_offsets - hrtimer helper | 1613 | * ktime_get_update_offsets - hrtimer helper |
| 1614 | * @offs_real: pointer to storage for monotonic -> realtime offset | 1614 | * @offs_real: pointer to storage for monotonic -> realtime offset |
| 1615 | * @offs_boot: pointer to storage for monotonic -> boottime offset | 1615 | * @offs_boot: pointer to storage for monotonic -> boottime offset |
| 1616 | * @offs_tai: pointer to storage for monotonic -> clock tai offset | ||
| 1616 | * | 1617 | * |
| 1617 | * Returns current monotonic time and updates the offsets | 1618 | * Returns current monotonic time and updates the offsets |
| 1618 | * Called from hrtimer_interupt() or retrigger_next_event() | 1619 | * Called from hrtimer_interrupt() or retrigger_next_event() |
| 1619 | */ | 1620 | */ |
| 1620 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, | 1621 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, |
| 1621 | ktime_t *offs_tai) | 1622 | ktime_t *offs_tai) |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 0b537f27b559..1fb08f21302e 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
| @@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v) | |||
| 298 | period = ktime_to_timespec(time); | 298 | period = ktime_to_timespec(time); |
| 299 | ms = period.tv_nsec / 1000000; | 299 | ms = period.tv_nsec / 1000000; |
| 300 | 300 | ||
| 301 | seq_puts(m, "Timer Stats Version: v0.2\n"); | 301 | seq_puts(m, "Timer Stats Version: v0.3\n"); |
| 302 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); | 302 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); |
| 303 | if (atomic_read(&overflow_count)) | 303 | if (atomic_read(&overflow_count)) |
| 304 | seq_printf(m, "Overflow: %d entries\n", | 304 | seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); |
| 305 | atomic_read(&overflow_count)); | 305 | seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); |
| 306 | 306 | ||
| 307 | for (i = 0; i < nr_entries; i++) { | 307 | for (i = 0; i < nr_entries; i++) { |
| 308 | entry = entries + i; | 308 | entry = entries + i; |
| 309 | if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { | 309 | if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { |
| 310 | seq_printf(m, "%4luD, %5d %-16s ", | 310 | seq_printf(m, "%4luD, %5d %-16s ", |
| 311 | entry->count, entry->pid, entry->comm); | 311 | entry->count, entry->pid, entry->comm); |
| 312 | } else { | 312 | } else { |
diff --git a/kernel/timer.c b/kernel/timer.c index 4296d13db3d1..6582b82fa966 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) | |||
| 1092 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | 1092 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), |
| 1093 | unsigned long data) | 1093 | unsigned long data) |
| 1094 | { | 1094 | { |
| 1095 | int preempt_count = preempt_count(); | 1095 | int count = preempt_count(); |
| 1096 | 1096 | ||
| 1097 | #ifdef CONFIG_LOCKDEP | 1097 | #ifdef CONFIG_LOCKDEP |
| 1098 | /* | 1098 | /* |
| @@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
| 1119 | 1119 | ||
| 1120 | lock_map_release(&lockdep_map); | 1120 | lock_map_release(&lockdep_map); |
| 1121 | 1121 | ||
| 1122 | if (preempt_count != preempt_count()) { | 1122 | if (count != preempt_count()) { |
| 1123 | WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", | 1123 | WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", |
| 1124 | fn, preempt_count, preempt_count()); | 1124 | fn, count, preempt_count()); |
| 1125 | /* | 1125 | /* |
| 1126 | * Restore the preempt count. That gives us a decent | 1126 | * Restore the preempt count. That gives us a decent |
| 1127 | * chance to survive and extract information. If the | 1127 | * chance to survive and extract information. If the |
| 1128 | * callback kept a lock held, bad luck, but not worse | 1128 | * callback kept a lock held, bad luck, but not worse |
| 1129 | * than the BUG() we had. | 1129 | * than the BUG() we had. |
| 1130 | */ | 1130 | */ |
| 1131 | preempt_count() = preempt_count; | 1131 | preempt_count_set(count); |
| 1132 | } | 1132 | } |
| 1133 | } | 1133 | } |
| 1134 | 1134 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7974ba20557d..d9fea7dfd5d3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, | |||
| 1509 | #endif | 1509 | #endif |
| 1510 | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | | 1510 | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | |
| 1511 | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | | 1511 | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | |
| 1512 | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); | 1512 | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | |
| 1513 | (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); | ||
| 1513 | } | 1514 | } |
| 1514 | EXPORT_SYMBOL_GPL(tracing_generic_entry_update); | 1515 | EXPORT_SYMBOL_GPL(tracing_generic_entry_update); |
| 1515 | 1516 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 10c86fb7a2b4..73d08aa25b55 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -124,6 +124,7 @@ enum trace_flag_type { | |||
| 124 | TRACE_FLAG_NEED_RESCHED = 0x04, | 124 | TRACE_FLAG_NEED_RESCHED = 0x04, |
| 125 | TRACE_FLAG_HARDIRQ = 0x08, | 125 | TRACE_FLAG_HARDIRQ = 0x08, |
| 126 | TRACE_FLAG_SOFTIRQ = 0x10, | 126 | TRACE_FLAG_SOFTIRQ = 0x10, |
| 127 | TRACE_FLAG_PREEMPT_RESCHED = 0x20, | ||
| 127 | }; | 128 | }; |
| 128 | 129 | ||
| 129 | #define TRACE_BUF_SIZE 1024 | 130 | #define TRACE_BUF_SIZE 1024 |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 80c36bcf66e8..78e27e3b52ac 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
| @@ -26,7 +26,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | |||
| 26 | { | 26 | { |
| 27 | /* The ftrace function trace is allowed only for root. */ | 27 | /* The ftrace function trace is allowed only for root. */ |
| 28 | if (ftrace_event_is_function(tp_event) && | 28 | if (ftrace_event_is_function(tp_event) && |
| 29 | perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | 29 | perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) |
| 30 | return -EPERM; | 30 | return -EPERM; |
| 31 | 31 | ||
| 32 | /* No tracing, just counting, so no obvious leak */ | 32 | /* No tracing, just counting, so no obvious leak */ |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 34e7cbac0c9c..ed32284fbe32 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
| 618 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | 618 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : |
| 619 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : | 619 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : |
| 620 | '.'; | 620 | '.'; |
| 621 | need_resched = | 621 | |
| 622 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; | 622 | switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | |
| 623 | TRACE_FLAG_PREEMPT_RESCHED)) { | ||
| 624 | case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: | ||
| 625 | need_resched = 'N'; | ||
| 626 | break; | ||
| 627 | case TRACE_FLAG_NEED_RESCHED: | ||
| 628 | need_resched = 'n'; | ||
| 629 | break; | ||
| 630 | case TRACE_FLAG_PREEMPT_RESCHED: | ||
| 631 | need_resched = 'p'; | ||
| 632 | break; | ||
| 633 | default: | ||
| 634 | need_resched = '.'; | ||
| 635 | break; | ||
| 636 | } | ||
| 637 | |||
| 623 | hardsoft_irq = | 638 | hardsoft_irq = |
| 624 | (hardirq && softirq) ? 'H' : | 639 | (hardirq && softirq) ? 'H' : |
| 625 | hardirq ? 'h' : | 640 | hardirq ? 'h' : |
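
Together with the new TRACE_FLAG_PREEMPT_RESCHED bit and the tif_need_resched()/test_preempt_need_resched() split recorded in tracing_generic_entry_update(), the latency format can now tell the two resched conditions apart. The decoding added above is easy to check stand-alone; the flag values are copied from the patch:

#include <stdio.h>

#define TRACE_FLAG_NEED_RESCHED		0x04
#define TRACE_FLAG_PREEMPT_RESCHED	0x20

static char need_resched_char(unsigned int flags)
{
	switch (flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED)) {
	case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
		return 'N';	/* both TIF_NEED_RESCHED and PREEMPT_NEED_RESCHED */
	case TRACE_FLAG_NEED_RESCHED:
		return 'n';	/* only the thread-info flag */
	case TRACE_FLAG_PREEMPT_RESCHED:
		return 'p';	/* only the preempt-count folded flag */
	default:
		return '.';
	}
}

int main(void)
{
	/* prints: N n p . */
	printf("%c %c %c %c\n",
	       need_resched_char(0x24), need_resched_char(0x04),
	       need_resched_char(0x20), need_resched_char(0x00));
	return 0;
}
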
