Diffstat (limited to 'kernel/fork.c')
 -rw-r--r--  kernel/fork.c | 175
 1 file changed, 154 insertions(+), 21 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index beb31725f7e2..c060c7e7c247 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack)
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
-# if THREAD_SIZE >= PAGE_SIZE
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
-                                              int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+
+#ifdef CONFIG_VMAP_STACK
+/*
+ * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ * flush.  Try to minimize the number of calls by caching stacks.
+ */
+#define NR_CACHED_STACKS 2
+static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+#endif
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
+#ifdef CONFIG_VMAP_STACK
+        void *stack;
+        int i;
+
+        local_irq_disable();
+        for (i = 0; i < NR_CACHED_STACKS; i++) {
+                struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+
+                if (!s)
+                        continue;
+                this_cpu_write(cached_stacks[i], NULL);
+
+                tsk->stack_vm_area = s;
+                local_irq_enable();
+                return s->addr;
+        }
+        local_irq_enable();
+
+        stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+                                     VMALLOC_START, VMALLOC_END,
+                                     THREADINFO_GFP | __GFP_HIGHMEM,
+                                     PAGE_KERNEL,
+                                     0, node, __builtin_return_address(0));
+
+        /*
+         * We can't call find_vm_area() in interrupt context, and
+         * free_thread_stack() can be called in interrupt context,
+         * so cache the vm_struct.
+         */
+        if (stack)
+                tsk->stack_vm_area = find_vm_area(stack);
+        return stack;
+#else
         struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                              THREAD_SIZE_ORDER);
 
         return page ? page_address(page) : NULL;
+#endif
 }
 
-static inline void free_thread_stack(unsigned long *stack)
+static inline void free_thread_stack(struct task_struct *tsk)
 {
-        __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
+#ifdef CONFIG_VMAP_STACK
+        if (task_stack_vm_area(tsk)) {
+                unsigned long flags;
+                int i;
+
+                local_irq_save(flags);
+                for (i = 0; i < NR_CACHED_STACKS; i++) {
+                        if (this_cpu_read(cached_stacks[i]))
+                                continue;
+
+                        this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+                        local_irq_restore(flags);
+                        return;
+                }
+                local_irq_restore(flags);
+
+                vfree(tsk->stack);
+                return;
+        }
+#endif
+
+        __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
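The hunk above is the core of the change: with CONFIG_VMAP_STACK, thread stacks come from vmalloc(), and a small per-CPU cache of freed stacks absorbs the cost of repeated vfree() TLB flushes. As a rough illustration of the caching pattern only (not kernel code), here is a minimal userspace sketch: a fixed number of slots is scanned before falling back to the real allocator, and freed blocks are parked in an empty slot when possible. All names here (stack_cache_alloc, stack_cache_free, NR_CACHED) are hypothetical, and a mutex stands in for the kernel's per-CPU arrays accessed with interrupts disabled.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CACHED 2
#define STACK_BYTES (16 * 1024)

static void *cached[NR_CACHED];
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

static void *stack_cache_alloc(void)
{
        void *p = NULL;
        int i;

        pthread_mutex_lock(&cache_lock);
        for (i = 0; i < NR_CACHED; i++) {
                if (cached[i]) {
                        p = cached[i];          /* fast path: reuse a cached block */
                        cached[i] = NULL;
                        break;
                }
        }
        pthread_mutex_unlock(&cache_lock);

        return p ? p : malloc(STACK_BYTES);     /* slow path: real allocation */
}

static void stack_cache_free(void *p)
{
        int i;

        pthread_mutex_lock(&cache_lock);
        for (i = 0; i < NR_CACHED; i++) {
                if (!cached[i]) {
                        cached[i] = p;          /* park the block for later reuse */
                        pthread_mutex_unlock(&cache_lock);
                        return;
                }
        }
        pthread_mutex_unlock(&cache_lock);
        free(p);                                /* cache full: really free it */
}

int main(void)
{
        void *a = stack_cache_alloc();
        stack_cache_free(a);
        void *b = stack_cache_alloc();          /* returns the cached block */
        printf("reused: %s\n", a == b ? "yes" : "no");
        free(b);
        return 0;
}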
@@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
         return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_stack(unsigned long *stack)
+static void free_thread_stack(struct task_struct *tsk)
 {
-        kmem_cache_free(thread_stack_cache, stack);
+        kmem_cache_free(thread_stack_cache, tsk->stack);
 }
 
 void thread_stack_cache_init(void)
@@ -213,24 +277,76 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(unsigned long *stack, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
 {
-        /* All stack pages are in the same zone and belong to the same memcg. */
-        struct page *first_page = virt_to_page(stack);
+        void *stack = task_stack_page(tsk);
+        struct vm_struct *vm = task_stack_vm_area(tsk);
+
+        BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+        if (vm) {
+                int i;
 
-        mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-                            THREAD_SIZE / 1024 * account);
+                BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
 
-        memcg_kmem_update_page_stat(
-                first_page, MEMCG_KERNEL_STACK_KB,
-                account * (THREAD_SIZE / 1024));
+                for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+                        mod_zone_page_state(page_zone(vm->pages[i]),
+                                            NR_KERNEL_STACK_KB,
+                                            PAGE_SIZE / 1024 * account);
+                }
+
+                /* All stack pages belong to the same memcg. */
+                memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+                                            account * (THREAD_SIZE / 1024));
+        } else {
+                /*
+                 * All stack pages are in the same zone and belong to the
+                 * same memcg.
+                 */
+                struct page *first_page = virt_to_page(stack);
+
+                mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+                                    THREAD_SIZE / 1024 * account);
+
+                memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+                                            account * (THREAD_SIZE / 1024));
+        }
 }
 
-void free_task(struct task_struct *tsk)
+static void release_task_stack(struct task_struct *tsk)
 {
-        account_kernel_stack(tsk->stack, -1);
+        account_kernel_stack(tsk, -1);
         arch_release_thread_stack(tsk->stack);
-        free_thread_stack(tsk->stack);
+        free_thread_stack(tsk);
+        tsk->stack = NULL;
+#ifdef CONFIG_VMAP_STACK
+        tsk->stack_vm_area = NULL;
+#endif
+}
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+        if (atomic_dec_and_test(&tsk->stack_refcount))
+                release_task_stack(tsk);
+}
+#endif
+
+void free_task(struct task_struct *tsk)
+{
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+        /*
+         * The task is finally done with both the stack and thread_info,
+         * so free both.
+         */
+        release_task_stack(tsk);
+#else
+        /*
+         * If the task had a separate stack allocation, it should be gone
+         * by now.
+         */
+        WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+#endif
         rt_mutex_debug_task_free(tsk);
         ftrace_graph_exit_task(tsk);
         put_seccomp_filter(tsk);
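Two things happen in the hunk above. First, the vmap accounting path charges each stack page to its own zone, since vmalloc() pages need not be physically contiguous or even come from a single zone, while the non-vmap path can still charge all of THREAD_SIZE to the zone of the first page. Second, stack freeing is split out of free_task() into release_task_stack() and, when thread_info lives in task_struct, guarded by a reference count so the stack can outlive the task's final use of it. Below is a minimal userspace sketch of that last-reference-frees idiom using C11 atomics; the struct and function names are hypothetical, and atomic_fetch_sub()'s old-value test plays the role of the kernel's atomic_dec_and_test().

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct stack_ref {
        atomic_int refcount;
        void *stack;
};

static void release_stack(struct stack_ref *s)
{
        free(s->stack);
        s->stack = NULL;
}

static void put_stack(struct stack_ref *s)
{
        /* fetch_sub returns the old value; old == 1 means we hit zero */
        if (atomic_fetch_sub(&s->refcount, 1) == 1)
                release_stack(s);
}

int main(void)
{
        struct stack_ref s = { .stack = malloc(64) };
        atomic_init(&s.refcount, 2);

        put_stack(&s);                          /* first put: stack survives */
        printf("after first put: %p\n", s.stack);
        put_stack(&s);                          /* last put: stack is freed */
        printf("after last put:  %p\n", s.stack);
        return 0;
}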
@@ -342,6 +458,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
         struct task_struct *tsk;
         unsigned long *stack;
+        struct vm_struct *stack_vm_area;
         int err;
 
         if (node == NUMA_NO_NODE)
@@ -354,11 +471,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
         if (!stack)
                 goto free_tsk;
 
+        stack_vm_area = task_stack_vm_area(tsk);
+
         err = arch_dup_task_struct(tsk, orig);
+
+        /*
+         * arch_dup_task_struct() clobbers the stack-related fields.  Make
+         * sure they're properly initialized before using any stack-related
+         * functions again.
+         */
+        tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+        tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+        atomic_set(&tsk->stack_refcount, 1);
+#endif
+
         if (err)
                 goto free_stack;
 
-        tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
         /*
          * We must handle setting up seccomp filters once we're under
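The reordering in this hunk matters because arch_dup_task_struct() generally performs a whole-struct copy (*dst = *src), which overwrites the child's freshly assigned stack fields with the parent's values; they are therefore rewritten immediately after the copy, before any stack-related helper can run. A standalone sketch of the clobber-and-restore ordering follows; struct task and dup_struct are hypothetical stand-ins for the kernel types.

#include <assert.h>
#include <stdlib.h>

struct task {
        void *stack;
        int pid;
};

static int dup_struct(struct task *dst, const struct task *src)
{
        *dst = *src;    /* clobbers dst->stack with src's pointer */
        return 0;
}

int main(void)
{
        struct task parent = { .stack = malloc(64), .pid = 1 };
        struct task child;

        void *stack = malloc(64);       /* the child's own stack */
        child.stack = stack;

        dup_struct(&child, &parent);
        /* Without this restore, child.stack would alias parent.stack. */
        child.stack = stack;

        assert(child.stack != parent.stack);
        free(parent.stack);
        free(stack);
        return 0;
}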
@@ -390,14 +522,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
         tsk->task_frag.page = NULL;
         tsk->wake_q.next = NULL;
 
-        account_kernel_stack(stack, 1);
+        account_kernel_stack(tsk, 1);
 
         kcov_task_init(tsk);
 
         return tsk;
 
 free_stack:
-        free_thread_stack(stack);
+        free_thread_stack(tsk);
 free_tsk:
         free_task_struct(tsk);
         return NULL;
@@ -1715,6 +1847,7 @@ bad_fork_cleanup_count:
         atomic_dec(&p->cred->user->processes);
         exit_creds(p);
 bad_fork_free:
+        put_task_stack(p);
         free_task(p);
 fork_out:
         return ERR_PTR(retval);
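A note on this last hunk: with CONFIG_THREAD_INFO_IN_TASK, free_task() no longer frees the stack itself, so the bad_fork_free error path must drop the reference that dup_task_struct() took with atomic_set(&tsk->stack_refcount, 1). Without this put_task_stack(), the WARN_ON_ONCE() in free_task() would fire and the stack would leak on the fork error path.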