Diffstat (limited to 'kernel/fork.c'):
 -rw-r--r--  kernel/fork.c | 484
 1 file changed, 267 insertions(+), 217 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2295fc69717f..c7c112391d79 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,7 @@
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
+#include <linux/sched/mm.h>
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
@@ -282,8 +283,9 @@ static void free_thread_stack(struct task_struct *tsk)
 
 void thread_stack_cache_init(void)
 {
-	thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
-					      THREAD_SIZE, 0, NULL);
+	thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
+					      THREAD_SIZE, THREAD_SIZE, 0, 0,
+					      THREAD_SIZE, NULL);
 	BUG_ON(thread_stack_cache == NULL);
 }
 # endif
@@ -390,6 +392,246 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
+#ifdef CONFIG_MMU
+static __latent_entropy int dup_mmap(struct mm_struct *mm,
+					struct mm_struct *oldmm)
+{
+	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
+	struct rb_node **rb_link, *rb_parent;
+	int retval;
+	unsigned long charge;
+	LIST_HEAD(uf);
+
+	uprobe_start_dup_mmap();
+	if (down_write_killable(&oldmm->mmap_sem)) {
+		retval = -EINTR;
+		goto fail_uprobe_end;
+	}
+	flush_cache_dup_mm(oldmm);
+	uprobe_dup_mmap(oldmm, mm);
+	/*
+	 * Not linked in yet - no deadlock potential:
+	 */
+	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+	/* No ordering required: file already has been exposed. */
+	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
+	mm->total_vm = oldmm->total_vm;
+	mm->data_vm = oldmm->data_vm;
+	mm->exec_vm = oldmm->exec_vm;
+	mm->stack_vm = oldmm->stack_vm;
+
+	rb_link = &mm->mm_rb.rb_node;
+	rb_parent = NULL;
+	pprev = &mm->mmap;
+	retval = ksm_fork(mm, oldmm);
+	if (retval)
+		goto out;
+	retval = khugepaged_fork(mm, oldmm);
+	if (retval)
+		goto out;
+
+	prev = NULL;
+	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+		struct file *file;
+
+		if (mpnt->vm_flags & VM_DONTCOPY) {
+			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+			continue;
+		}
+		charge = 0;
+		if (mpnt->vm_flags & VM_ACCOUNT) {
+			unsigned long len = vma_pages(mpnt);
+
+			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+				goto fail_nomem;
+			charge = len;
+		}
+		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		if (!tmp)
+			goto fail_nomem;
+		*tmp = *mpnt;
+		INIT_LIST_HEAD(&tmp->anon_vma_chain);
+		retval = vma_dup_policy(mpnt, tmp);
+		if (retval)
+			goto fail_nomem_policy;
+		tmp->vm_mm = mm;
+		retval = dup_userfaultfd(tmp, &uf);
+		if (retval)
+			goto fail_nomem_anon_vma_fork;
+		if (tmp->vm_flags & VM_WIPEONFORK) {
+			/* VM_WIPEONFORK gets a clean slate in the child. */
+			tmp->anon_vma = NULL;
+			if (anon_vma_prepare(tmp))
+				goto fail_nomem_anon_vma_fork;
+		} else if (anon_vma_fork(tmp, mpnt))
+			goto fail_nomem_anon_vma_fork;
+		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+		tmp->vm_next = tmp->vm_prev = NULL;
+		file = tmp->vm_file;
+		if (file) {
+			struct inode *inode = file_inode(file);
+			struct address_space *mapping = file->f_mapping;
+
+			get_file(file);
+			if (tmp->vm_flags & VM_DENYWRITE)
+				atomic_dec(&inode->i_writecount);
+			i_mmap_lock_write(mapping);
+			if (tmp->vm_flags & VM_SHARED)
+				atomic_inc(&mapping->i_mmap_writable);
+			flush_dcache_mmap_lock(mapping);
+			/* insert tmp into the share list, just after mpnt */
+			vma_interval_tree_insert_after(tmp, mpnt,
+					&mapping->i_mmap);
+			flush_dcache_mmap_unlock(mapping);
+			i_mmap_unlock_write(mapping);
+		}
+
+		/*
+		 * Clear hugetlb-related page reserves for children. This only
+		 * affects MAP_PRIVATE mappings. Faults generated by the child
+		 * are not guaranteed to succeed, even if read-only
+		 */
+		if (is_vm_hugetlb_page(tmp))
+			reset_vma_resv_huge_pages(tmp);
+
+		/*
+		 * Link in the new vma and copy the page table entries.
+		 */
+		*pprev = tmp;
+		pprev = &tmp->vm_next;
+		tmp->vm_prev = prev;
+		prev = tmp;
+
+		__vma_link_rb(mm, tmp, rb_link, rb_parent);
+		rb_link = &tmp->vm_rb.rb_right;
+		rb_parent = &tmp->vm_rb;
+
+		mm->map_count++;
+		if (!(tmp->vm_flags & VM_WIPEONFORK))
+			retval = copy_page_range(mm, oldmm, mpnt);
+
+		if (tmp->vm_ops && tmp->vm_ops->open)
+			tmp->vm_ops->open(tmp);
+
+		if (retval)
+			goto out;
+	}
+	/* a new mm has just been created */
+	arch_dup_mmap(oldmm, mm);
+	retval = 0;
+out:
+	up_write(&mm->mmap_sem);
+	flush_tlb_mm(oldmm);
+	up_write(&oldmm->mmap_sem);
+	dup_userfaultfd_complete(&uf);
+fail_uprobe_end:
+	uprobe_end_dup_mmap();
+	return retval;
+fail_nomem_anon_vma_fork:
+	mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+	kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+	retval = -ENOMEM;
+	vm_unacct_memory(charge);
+	goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct *mm)
+{
+	mm->pgd = pgd_alloc(mm);
+	if (unlikely(!mm->pgd))
+		return -ENOMEM;
+	return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct *mm)
+{
+	pgd_free(mm, mm->pgd);
+}
+#else
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	down_write(&oldmm->mmap_sem);
+	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+	up_write(&oldmm->mmap_sem);
+	return 0;
+}
+#define mm_alloc_pgd(mm)	(0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+static void check_mm(struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < NR_MM_COUNTERS; i++) {
+		long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+		if (unlikely(x))
+			printk(KERN_ALERT "BUG: Bad rss-counter state "
+					  "mm:%p idx:%d val:%ld\n", mm, i, x);
+	}
+
+	if (mm_pgtables_bytes(mm))
+		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+				mm_pgtables_bytes(mm));
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+#endif
+}
+
+#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+static void __mmdrop(struct mm_struct *mm)
+{
+	BUG_ON(mm == &init_mm);
+	mm_free_pgd(mm);
+	destroy_context(mm);
+	hmm_mm_destroy(mm);
+	mmu_notifier_mm_destroy(mm);
+	check_mm(mm);
+	put_user_ns(mm->user_ns);
+	free_mm(mm);
+}
+
+void mmdrop(struct mm_struct *mm)
+{
+	/*
+	 * The implicit full barrier implied by atomic_dec_and_test() is
+	 * required by the membarrier system call before returning to
+	 * user-space, after storing to rq->curr.
+	 */
+	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
+		__mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmdrop);
+
+static void mmdrop_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm;
+
+	mm = container_of(work, struct mm_struct, async_put_work);
+	__mmdrop(mm);
+}
+
+static void mmdrop_async(struct mm_struct *mm)
+{
+	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+
 static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
@@ -457,6 +699,21 @@ static void set_max_threads(unsigned int max_threads_suggested)
 int arch_task_struct_size __read_mostly;
 #endif
 
+static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
+{
+	/* Fetch thread_struct whitelist for the architecture. */
+	arch_thread_struct_whitelist(offset, size);
+
+	/*
+	 * Handle zero-sized whitelist or empty thread_struct, otherwise
+	 * adjust offset to position of thread_struct in task_struct.
+	 */
+	if (unlikely(*size == 0))
+		*offset = 0;
+	else
+		*offset += offsetof(struct task_struct, thread);
+}
+
 void __init fork_init(void)
 {
 	int i;
@@ -465,11 +722,14 @@ void __init fork_init(void)
 #define ARCH_MIN_TASKALIGN	0
 #endif
 	int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+	unsigned long useroffset, usersize;
 
 	/* create a slab on which task_structs can be allocated */
-	task_struct_cachep = kmem_cache_create("task_struct",
+	task_struct_whitelist(&useroffset, &usersize);
+	task_struct_cachep = kmem_cache_create_usercopy("task_struct",
 			arch_task_struct_size, align,
-			SLAB_PANIC|SLAB_ACCOUNT, NULL);
+			SLAB_PANIC|SLAB_ACCOUNT,
+			useroffset, usersize, NULL);
 #endif
 
 	/* do the arch specific task caches init */
@@ -594,181 +854,8 @@ free_tsk:
 	return NULL;
 }
 
-#ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
-					struct mm_struct *oldmm)
-{
-	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
-	struct rb_node **rb_link, *rb_parent;
-	int retval;
-	unsigned long charge;
-	LIST_HEAD(uf);
-
-	uprobe_start_dup_mmap();
-	if (down_write_killable(&oldmm->mmap_sem)) {
-		retval = -EINTR;
-		goto fail_uprobe_end;
-	}
-	flush_cache_dup_mm(oldmm);
-	uprobe_dup_mmap(oldmm, mm);
-	/*
-	 * Not linked in yet - no deadlock potential:
-	 */
-	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
-
-	/* No ordering required: file already has been exposed. */
-	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-
-	mm->total_vm = oldmm->total_vm;
-	mm->data_vm = oldmm->data_vm;
-	mm->exec_vm = oldmm->exec_vm;
-	mm->stack_vm = oldmm->stack_vm;
-
-	rb_link = &mm->mm_rb.rb_node;
-	rb_parent = NULL;
-	pprev = &mm->mmap;
-	retval = ksm_fork(mm, oldmm);
-	if (retval)
-		goto out;
-	retval = khugepaged_fork(mm, oldmm);
-	if (retval)
-		goto out;
-
-	prev = NULL;
-	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
-		struct file *file;
-
-		if (mpnt->vm_flags & VM_DONTCOPY) {
-			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
-			continue;
-		}
-		charge = 0;
-		if (mpnt->vm_flags & VM_ACCOUNT) {
-			unsigned long len = vma_pages(mpnt);
-
-			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
-				goto fail_nomem;
-			charge = len;
-		}
-		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-		if (!tmp)
-			goto fail_nomem;
-		*tmp = *mpnt;
-		INIT_LIST_HEAD(&tmp->anon_vma_chain);
-		retval = vma_dup_policy(mpnt, tmp);
-		if (retval)
-			goto fail_nomem_policy;
-		tmp->vm_mm = mm;
-		retval = dup_userfaultfd(tmp, &uf);
-		if (retval)
-			goto fail_nomem_anon_vma_fork;
-		if (tmp->vm_flags & VM_WIPEONFORK) {
-			/* VM_WIPEONFORK gets a clean slate in the child. */
-			tmp->anon_vma = NULL;
-			if (anon_vma_prepare(tmp))
-				goto fail_nomem_anon_vma_fork;
-		} else if (anon_vma_fork(tmp, mpnt))
-			goto fail_nomem_anon_vma_fork;
-		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
-		tmp->vm_next = tmp->vm_prev = NULL;
-		file = tmp->vm_file;
-		if (file) {
-			struct inode *inode = file_inode(file);
-			struct address_space *mapping = file->f_mapping;
-
-			get_file(file);
-			if (tmp->vm_flags & VM_DENYWRITE)
-				atomic_dec(&inode->i_writecount);
-			i_mmap_lock_write(mapping);
-			if (tmp->vm_flags & VM_SHARED)
-				atomic_inc(&mapping->i_mmap_writable);
-			flush_dcache_mmap_lock(mapping);
-			/* insert tmp into the share list, just after mpnt */
-			vma_interval_tree_insert_after(tmp, mpnt,
-					&mapping->i_mmap);
-			flush_dcache_mmap_unlock(mapping);
-			i_mmap_unlock_write(mapping);
-		}
-
-		/*
-		 * Clear hugetlb-related page reserves for children. This only
-		 * affects MAP_PRIVATE mappings. Faults generated by the child
-		 * are not guaranteed to succeed, even if read-only
-		 */
-		if (is_vm_hugetlb_page(tmp))
-			reset_vma_resv_huge_pages(tmp);
-
-		/*
-		 * Link in the new vma and copy the page table entries.
-		 */
-		*pprev = tmp;
-		pprev = &tmp->vm_next;
-		tmp->vm_prev = prev;
-		prev = tmp;
-
-		__vma_link_rb(mm, tmp, rb_link, rb_parent);
-		rb_link = &tmp->vm_rb.rb_right;
-		rb_parent = &tmp->vm_rb;
-
-		mm->map_count++;
-		if (!(tmp->vm_flags & VM_WIPEONFORK))
-			retval = copy_page_range(mm, oldmm, mpnt);
-
-		if (tmp->vm_ops && tmp->vm_ops->open)
-			tmp->vm_ops->open(tmp);
-
-		if (retval)
-			goto out;
-	}
-	/* a new mm has just been created */
-	retval = arch_dup_mmap(oldmm, mm);
-out:
-	up_write(&mm->mmap_sem);
-	flush_tlb_mm(oldmm);
-	up_write(&oldmm->mmap_sem);
-	dup_userfaultfd_complete(&uf);
-fail_uprobe_end:
-	uprobe_end_dup_mmap();
-	return retval;
-fail_nomem_anon_vma_fork:
-	mpol_put(vma_policy(tmp));
-fail_nomem_policy:
-	kmem_cache_free(vm_area_cachep, tmp);
-fail_nomem:
-	retval = -ENOMEM;
-	vm_unacct_memory(charge);
-	goto out;
-}
-
-static inline int mm_alloc_pgd(struct mm_struct *mm)
-{
-	mm->pgd = pgd_alloc(mm);
-	if (unlikely(!mm->pgd))
-		return -ENOMEM;
-	return 0;
-}
-
-static inline void mm_free_pgd(struct mm_struct *mm)
-{
-	pgd_free(mm, mm->pgd);
-}
-#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-	down_write(&oldmm->mmap_sem);
-	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-	up_write(&oldmm->mmap_sem);
-	return 0;
-}
-#define mm_alloc_pgd(mm)	(0)
-#define mm_free_pgd(mm)
-#endif /* CONFIG_MMU */
-
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
-
 static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
 
 static int __init coredump_filter_setup(char *s)
@@ -858,27 +945,6 @@ fail_nopgd:
 	return NULL;
 }
 
-static void check_mm(struct mm_struct *mm)
-{
-	int i;
-
-	for (i = 0; i < NR_MM_COUNTERS; i++) {
-		long x = atomic_long_read(&mm->rss_stat.count[i]);
-
-		if (unlikely(x))
-			printk(KERN_ALERT "BUG: Bad rss-counter state "
-					  "mm:%p idx:%d val:%ld\n", mm, i, x);
-	}
-
-	if (mm_pgtables_bytes(mm))
-		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
-				mm_pgtables_bytes(mm));
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
-#endif
-}
-
 /*
  * Allocate and initialize an mm_struct.
  */
@@ -894,24 +960,6 @@ struct mm_struct *mm_alloc(void)
 	return mm_init(mm, current, current_user_ns());
 }
 
-/*
- * Called when the last reference to the mm
- * is dropped: either by a lazy thread or by
- * mmput. Free the page directory and the mm.
- */
-void __mmdrop(struct mm_struct *mm)
-{
-	BUG_ON(mm == &init_mm);
-	mm_free_pgd(mm);
-	destroy_context(mm);
-	hmm_mm_destroy(mm);
-	mmu_notifier_mm_destroy(mm);
-	check_mm(mm);
-	put_user_ns(mm->user_ns);
-	free_mm(mm);
-}
-EXPORT_SYMBOL_GPL(__mmdrop);
-
 static inline void __mmput(struct mm_struct *mm)
 {
 	VM_BUG_ON(atomic_read(&mm->mm_users));
@@ -2224,9 +2272,11 @@ void __init proc_caches_init(void)
 	 * maximum number of CPU's we can ever have. The cpumask_allocation
 	 * is at the end of the structure, exactly for that reason.
 	 */
-	mm_cachep = kmem_cache_create("mm_struct",
+	mm_cachep = kmem_cache_create_usercopy("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+			offsetof(struct mm_struct, saved_auxv),
+			sizeof_field(struct mm_struct, saved_auxv),
 			NULL);
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
 	mmap_init();
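
Note on the API used throughout this diff: kmem_cache_create_usercopy() takes the same arguments as kmem_cache_create() plus a useroffset/usersize pair that describes the only region of each slab object that may be copied to or from user space when CONFIG_HARDENED_USERCOPY is enabled. In the hunks above, that window is the whole stack for "thread_stack", the embedded thread_struct for "task_struct" (via task_struct_whitelist()), and saved_auxv for "mm_struct". As a minimal illustrative sketch, not part of this diff (struct foo_ctx, its payload field, and foo_cache_init() are hypothetical names), a cache that whitelists a single field could look like this:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>

/* Hypothetical object: only 'payload' is ever exchanged with user space. */
struct foo_ctx {
	unsigned long refcount;		/* kernel-internal, never exposed */
	char payload[128];		/* copied to/from user space */
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	/*
	 * Whitelist only foo_ctx.payload: useroffset is the offset of the
	 * field within the object, usersize its length. Hardened usercopy
	 * rejects any copy_to_user()/copy_from_user() on this cache that
	 * falls outside the window.
	 */
	foo_cachep = kmem_cache_create_usercopy("foo_ctx",
			sizeof(struct foo_ctx), 0,
			SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
			offsetof(struct foo_ctx, payload),
			sizeof_field(struct foo_ctx, payload),
			NULL);
	return foo_cachep ? 0 : -ENOMEM;
}

A zero usersize (as task_struct_whitelist() produces for an empty thread_struct whitelist) simply means nothing in the object may be copied directly to or from user space.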