author		Linus Torvalds <torvalds@linux-foundation.org>	2016-10-03 19:13:28 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-10-03 19:13:28 -0400
commit		1a4a2bc460721bc8f91e4c1294d39b38e5af132f (patch)
tree		fe646d05f6e17f05601e0a32cc796bec718ab6e7 /kernel
parent		110a9e42b68719f584879c5c5c727bbae90d15f9 (diff)
parent		1ef55be16ed69538f89e0a6508be5e62fdc9851c (diff)
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull low-level x86 updates from Ingo Molnar:
 "In this cycle this topic tree has become one of those 'super topics'
  that accumulated a lot of changes:

   - Add CONFIG_VMAP_STACK=y support to the core kernel and enable it
     on x86 - preceded by an array of changes. v4.8 saw preparatory
     changes in this area already - this is the rest of the work.
     Includes the thread stack caching performance optimization. (Andy
     Lutomirski)

   - switch_to() cleanups and all around enhancements. (Brian Gerst)

   - A large number of dumpstack infrastructure enhancements and an
     unwinder abstraction. The secret long term plan is safe(r) live
     patching plus maybe another attempt at debuginfo based unwinding -
     but all these current bits are standalone enhancements in a frame
     pointer based debug environment as well. (Josh Poimboeuf)

   - More __ro_after_init and const annotations. (Kees Cook)

   - Enable KASLR for the vmemmap memory region. (Thomas Garnier)"

[ The virtually mapped stack changes are pretty fundamental, and not
  x86-specific per se, even if they are only used on x86 right now. ]

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (70 commits)
  x86/asm: Get rid of __read_cr4_safe()
  thread_info: Use unsigned long for flags
  x86/alternatives: Add stack frame dependency to alternative_call_2()
  x86/dumpstack: Fix show_stack() task pointer regression
  x86/dumpstack: Remove dump_trace() and related callbacks
  x86/dumpstack: Convert show_trace_log_lvl() to use the new unwinder
  oprofile/x86: Convert x86_backtrace() to use the new unwinder
  x86/stacktrace: Convert save_stack_trace_*() to use the new unwinder
  perf/x86: Convert perf_callchain_kernel() to use the new unwinder
  x86/unwind: Add new unwind interface and implementations
  x86/dumpstack: Remove NULL task pointer convention
  fork: Optimize task creation by caching two thread stacks per CPU if CONFIG_VMAP_STACK=y
  sched/core: Free the stack early if CONFIG_THREAD_INFO_IN_TASK
  lib/syscall: Pin the task stack in collect_syscall()
  x86/process: Pin the target stack in get_wchan()
  x86/dumpstack: Pin the target stack when dumping it
  kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function
  sched/core: Add try_get_task_stack() and put_task_stack()
  x86/entry/64: Fix a minor comment rebase error
  iommu/amd: Don't put completion-wait semaphore on stack
  ...
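The pull message above centers on the new stack-pinning API (try_get_task_stack()/put_task_stack()) that several of the hunks below rely on. A minimal illustrative sketch, not code from this merge, of how a consumer of another task's stack is expected to use it; the dump_remote_stack() name is hypothetical and CONFIG_THREAD_INFO_IN_TASK=y is assumed:

	#include <linux/sched.h>

	/*
	 * Illustrative sketch only. With CONFIG_THREAD_INFO_IN_TASK the stack
	 * can be freed as soon as the task has scheduled away for the last
	 * time (see put_task_stack() in finish_task_switch() below), so any
	 * code inspecting another task's stack must pin it first.
	 */
	static void dump_remote_stack(struct task_struct *tsk)
	{
		if (!try_get_task_stack(tsk))
			return;			/* stack already freed, nothing to read */

		/* ... safely walk or copy tsk's kernel stack here ... */

		put_task_stack(tsk);		/* drop the pin; may free the stack */
	}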
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/fork.c                           175
-rw-r--r--  kernel/kthread.c                          8
-rw-r--r--  kernel/sched/core.c                       5
-rw-r--r--  kernel/sched/sched.h                      4
-rw-r--r--  kernel/trace/Kconfig                      5
-rw-r--r--  kernel/trace/trace_functions_graph.c     67
6 files changed, 233 insertions(+), 31 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index beb31725f7e2..c060c7e7c247 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack)
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
-# if THREAD_SIZE >= PAGE_SIZE
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
-						  int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+
+#ifdef CONFIG_VMAP_STACK
+/*
+ * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ * flush. Try to minimize the number of calls by caching stacks.
+ */
+#define NR_CACHED_STACKS 2
+static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+#endif
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
+#ifdef CONFIG_VMAP_STACK
+	void *stack;
+	int i;
+
+	local_irq_disable();
+	for (i = 0; i < NR_CACHED_STACKS; i++) {
+		struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+
+		if (!s)
+			continue;
+		this_cpu_write(cached_stacks[i], NULL);
+
+		tsk->stack_vm_area = s;
+		local_irq_enable();
+		return s->addr;
+	}
+	local_irq_enable();
+
+	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+				     VMALLOC_START, VMALLOC_END,
+				     THREADINFO_GFP | __GFP_HIGHMEM,
+				     PAGE_KERNEL,
+				     0, node, __builtin_return_address(0));
+
+	/*
+	 * We can't call find_vm_area() in interrupt context, and
+	 * free_thread_stack() can be called in interrupt context,
+	 * so cache the vm_struct.
+	 */
+	if (stack)
+		tsk->stack_vm_area = find_vm_area(stack);
+	return stack;
+#else
 	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
 					     THREAD_SIZE_ORDER);
 
 	return page ? page_address(page) : NULL;
+#endif
 }
 
-static inline void free_thread_stack(unsigned long *stack)
+static inline void free_thread_stack(struct task_struct *tsk)
 {
-	__free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
+#ifdef CONFIG_VMAP_STACK
+	if (task_stack_vm_area(tsk)) {
+		unsigned long flags;
+		int i;
+
+		local_irq_save(flags);
+		for (i = 0; i < NR_CACHED_STACKS; i++) {
+			if (this_cpu_read(cached_stacks[i]))
+				continue;
+
+			this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+			local_irq_restore(flags);
+			return;
+		}
+		local_irq_restore(flags);
+
+		vfree(tsk->stack);
+		return;
+	}
+#endif
+
+	__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
@@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_stack(unsigned long *stack)
+static void free_thread_stack(struct task_struct *tsk)
 {
-	kmem_cache_free(thread_stack_cache, stack);
+	kmem_cache_free(thread_stack_cache, tsk->stack);
 }
 
 void thread_stack_cache_init(void)
@@ -213,24 +277,76 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(unsigned long *stack, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
 {
-	/* All stack pages are in the same zone and belong to the same memcg. */
-	struct page *first_page = virt_to_page(stack);
+	void *stack = task_stack_page(tsk);
+	struct vm_struct *vm = task_stack_vm_area(tsk);
+
+	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+	if (vm) {
+		int i;
 
-	mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-			    THREAD_SIZE / 1024 * account);
+		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
 
-	memcg_kmem_update_page_stat(
-		first_page, MEMCG_KERNEL_STACK_KB,
-		account * (THREAD_SIZE / 1024));
+		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+			mod_zone_page_state(page_zone(vm->pages[i]),
+					    NR_KERNEL_STACK_KB,
+					    PAGE_SIZE / 1024 * account);
+		}
+
+		/* All stack pages belong to the same memcg. */
+		memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+					    account * (THREAD_SIZE / 1024));
+	} else {
+		/*
+		 * All stack pages are in the same zone and belong to the
+		 * same memcg.
+		 */
+		struct page *first_page = virt_to_page(stack);
+
+		mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+				    THREAD_SIZE / 1024 * account);
+
+		memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+					    account * (THREAD_SIZE / 1024));
+	}
 }
 
-void free_task(struct task_struct *tsk)
+static void release_task_stack(struct task_struct *tsk)
 {
-	account_kernel_stack(tsk->stack, -1);
+	account_kernel_stack(tsk, -1);
 	arch_release_thread_stack(tsk->stack);
-	free_thread_stack(tsk->stack);
+	free_thread_stack(tsk);
+	tsk->stack = NULL;
+#ifdef CONFIG_VMAP_STACK
+	tsk->stack_vm_area = NULL;
+#endif
+}
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+	if (atomic_dec_and_test(&tsk->stack_refcount))
+		release_task_stack(tsk);
+}
+#endif
+
+void free_task(struct task_struct *tsk)
+{
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+	/*
+	 * The task is finally done with both the stack and thread_info,
+	 * so free both.
+	 */
+	release_task_stack(tsk);
+#else
+	/*
+	 * If the task had a separate stack allocation, it should be gone
+	 * by now.
+	 */
+	WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+#endif
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
 	put_seccomp_filter(tsk);
@@ -342,6 +458,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
 	unsigned long *stack;
+	struct vm_struct *stack_vm_area;
 	int err;
 
 	if (node == NUMA_NO_NODE)
@@ -354,11 +471,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (!stack)
 		goto free_tsk;
 
+	stack_vm_area = task_stack_vm_area(tsk);
+
 	err = arch_dup_task_struct(tsk, orig);
+
+	/*
+	 * arch_dup_task_struct() clobbers the stack-related fields. Make
+	 * sure they're properly initialized before using any stack-related
+	 * functions again.
+	 */
+	tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+	tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	atomic_set(&tsk->stack_refcount, 1);
+#endif
+
 	if (err)
 		goto free_stack;
 
-	tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
@@ -390,14 +522,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->task_frag.page = NULL;
 	tsk->wake_q.next = NULL;
 
-	account_kernel_stack(stack, 1);
+	account_kernel_stack(tsk, 1);
 
 	kcov_task_init(tsk);
 
 	return tsk;
 
 free_stack:
-	free_thread_stack(stack);
+	free_thread_stack(tsk);
 free_tsk:
 	free_task_struct(tsk);
 	return NULL;
@@ -1715,6 +1847,7 @@ bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
 	exit_creds(p);
 bad_fork_free:
+	put_task_stack(p);
 	free_task(p);
 fork_out:
 	return ERR_PTR(retval);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9ff173dca1ae..4ab4c3766a80 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k)
 static struct kthread *to_live_kthread(struct task_struct *k)
 {
 	struct completion *vfork = ACCESS_ONCE(k->vfork_done);
-	if (likely(vfork))
+	if (likely(vfork) && try_get_task_stack(k))
 		return __to_kthread(vfork);
 	return NULL;
 }
@@ -425,8 +425,10 @@ void kthread_unpark(struct task_struct *k)
 {
 	struct kthread *kthread = to_live_kthread(k);
 
-	if (kthread)
+	if (kthread) {
 		__kthread_unpark(k, kthread);
+		put_task_stack(k);
+	}
 }
 EXPORT_SYMBOL_GPL(kthread_unpark);
 
@@ -455,6 +457,7 @@ int kthread_park(struct task_struct *k)
 				wait_for_completion(&kthread->parked);
 			}
 		}
+		put_task_stack(k);
 		ret = 0;
 	}
 	return ret;
@@ -490,6 +493,7 @@ int kthread_stop(struct task_struct *k)
 		__kthread_unpark(k, kthread);
 		wake_up_process(k);
 		wait_for_completion(&kthread->exited);
+		put_task_stack(k);
 	}
 	ret = k->exit_code;
 	put_task_struct(k);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fac6492f0b98..94732d1ab00a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2781,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		 * task and put them back on the free list.
 		 */
 		kprobe_flush_task(prev);
+
+		/* Task is done with its stack. */
+		put_task_stack(prev);
+
 		put_task_struct(prev);
 	}
 
@@ -3403,7 +3407,6 @@ static void __sched notrace __schedule(bool preempt)
 
 	balance_callback(rq);
 }
-STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
 
 void __noreturn do_task_dead(void)
 {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 58df5590d028..055f935d4421 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1021,7 +1021,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 	 * per-task data have been completed by this moment.
 	 */
 	smp_wmb();
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+	p->cpu = cpu;
+#else
 	task_thread_info(p)->cpu = cpu;
+#endif
 	p->wake_cpu = cpu;
 #endif
 }
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f4b86e8ca1e7..ba3326785ca4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER
 	help
 	  See Documentation/trace/ftrace-design.txt
 
-config HAVE_FUNCTION_GRAPH_FP_TEST
-	bool
-	help
-	  See Documentation/trace/ftrace-design.txt
-
 config HAVE_DYNAMIC_FTRACE
 	bool
 	help
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 7363ccf79512..0cbe38a844fa 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
 /* Add a function return address to the trace stack on thread info.*/
 int
 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
-			 unsigned long frame_pointer)
+			 unsigned long frame_pointer, unsigned long *retp)
 {
 	unsigned long long calltime;
 	int index;
@@ -171,7 +171,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
 	current->ret_stack[index].func = func;
 	current->ret_stack[index].calltime = calltime;
 	current->ret_stack[index].subtime = 0;
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
 	current->ret_stack[index].fp = frame_pointer;
+#endif
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+	current->ret_stack[index].retp = retp;
+#endif
 	*depth = current->curr_ret_stack;
 
 	return 0;
@@ -204,7 +209,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
 		return;
 	}
 
-#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
 	/*
 	 * The arch may choose to record the frame pointer used
 	 * and check it here to make sure that it is what we expect it
@@ -279,6 +284,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 	return ret;
 }
 
+/**
+ * ftrace_graph_ret_addr - convert a potentially modified stack return address
+ *			   to its original value
+ *
+ * This function can be called by stack unwinding code to convert a found stack
+ * return address ('ret') to its original value, in case the function graph
+ * tracer has modified it to be 'return_to_handler'. If the address hasn't
+ * been modified, the unchanged value of 'ret' is returned.
+ *
+ * 'idx' is a state variable which should be initialized by the caller to zero
+ * before the first call.
+ *
+ * 'retp' is a pointer to the return address on the stack. It's ignored if
+ * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ */
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+				    unsigned long ret, unsigned long *retp)
+{
+	int index = task->curr_ret_stack;
+	int i;
+
+	if (ret != (unsigned long)return_to_handler)
+		return ret;
+
+	if (index < -1)
+		index += FTRACE_NOTRACE_DEPTH;
+
+	if (index < 0)
+		return ret;
+
+	for (i = 0; i <= index; i++)
+		if (task->ret_stack[i].retp == retp)
+			return task->ret_stack[i].ret;
+
+	return ret;
+}
+#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+				    unsigned long ret, unsigned long *retp)
+{
+	int task_idx;
+
+	if (ret != (unsigned long)return_to_handler)
+		return ret;
+
+	task_idx = task->curr_ret_stack;
+
+	if (!task->ret_stack || task_idx < *idx)
+		return ret;
+
+	task_idx -= *idx;
+	(*idx)++;
+
+	return task->ret_stack[task_idx].ret;
+}
+#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+
 int __trace_graph_entry(struct trace_array *tr,
 			struct ftrace_graph_ent *trace,
 			unsigned long flags,
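The kernel-doc comment added above explains the contract of the new ftrace_graph_ret_addr() helper. A hedged usage sketch, not code from this merge, of how an unwinder-style caller might feed candidate return addresses through it; the function and parameter names here are illustrative and the address/location arrays are assumed to have been gathered elsewhere:

	#include <linux/ftrace.h>
	#include <linux/printk.h>
	#include <linux/sched.h>

	/*
	 * Illustrative sketch only. 'graph_idx' is the per-walk state variable
	 * the kernel-doc asks for: it starts at zero and is advanced by
	 * ftrace_graph_ret_addr() as return_to_handler entries are resolved.
	 */
	static void print_real_return_addrs(struct task_struct *task,
					    unsigned long *addrs,
					    unsigned long **locs, int nr)
	{
		int graph_idx = 0;
		int i;

		for (i = 0; i < nr; i++) {
			/* Map return_to_handler back to the traced caller, if needed. */
			unsigned long addr = ftrace_graph_ret_addr(task, &graph_idx,
								   addrs[i], locs[i]);

			pr_info("%pS\n", (void *)addr);
		}
	}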