author    Dave Hansen <dave.hansen@linux.intel.com>    2014-11-14 10:18:31 -0500
committer Thomas Gleixner <tglx@linutronix.de>         2014-11-17 18:58:54 -0500
commit    1de4fa14ee25a8edf287855513b61c3945c8878a (patch)
tree      b89755c77a75d95a234487d30661c03e01b2b538
parent    fe3d197f84319d3bce379a9c0dc17b1f48ad358c (diff)
x86, mpx: Cleanup unused bound tables
The previous patch allocates bounds tables on-demand. As noted in an
earlier description, these can add up to *HUGE* amounts of memory. This
has caused OOMs in practice when running tests.

This patch adds support for freeing bounds tables when they are no
longer in use.

There are two types of mappings in play when unmapping tables:
 1. The mapping with the actual data, which userspace is munmap()ing or
    brk()ing away, etc...
 2. The mapping for the bounds table *backing* the data (tagged with
    VM_MPX, see the patch "add MPX specific mmap interface").

If userspace uses the prctl() introduced earlier in this patchset to
enable kernel management of bounds tables, then when it unmaps the first
type of mapping (the one with the actual data), the kernel needs to free
the mapping for the bounds table backing that data. This patch hooks in
at the very end of do_munmap() to do so.

We look at the addresses being unmapped and find the bounds directory
entries and tables which cover those addresses. If an entire table is
unused, we clear the associated directory entry and free the table.

Once we unmap a bounds table, we would be left with a bounds directory
entry pointing at empty address space. That address space might now be
allocated for some other (random) use, and the MPX hardware might try to
walk it as if it were a bounds table. That would be bad. So any unmapping
of an entire bounds table has to be accompanied by a corresponding write
to the bounds directory entry to invalidate it.

That write to the bounds directory can fault, which causes the following
problem: since we are doing the freeing from munmap() (and other paths
like it), we hold mmap_sem for write. If we fault, the page fault handler
will attempt to acquire mmap_sem for read and we will deadlock. To avoid
the deadlock, we pagefault_disable() when touching the bounds directory
entry and use get_user_pages() to resolve the fault.

The unmapping of bounds tables happens under vm_munmap(), and we also
(indirectly) call vm_munmap() to _do_ the unmapping of the bounds tables.
We avoid unbounded recursion by disallowing the freeing of bounds tables
*for* bounds tables. This would not occur normally, so it should not have
any practical impact. Being strict about it here helps ensure that we do
not have an exploitable stack overflow.

Based-on-patch-by: Qiaowei Ren <qiaowei.ren@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: linux-mm@kvack.org
Cc: linux-mips@linux-mips.org
Cc: Dave Hansen <dave@sr71.net>
Link: http://lkml.kernel.org/r/20141114151831.E4531C4A@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
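Every access to a bounds directory entry in this patch therefore follows the
same retry shape; condensed below from the get_bt_addr() and
unmap_single_bt() loops added further down (a simplified sketch, not the
literal patch code):

	/*
	 * Sketch: we hold mmap_sem for write, so the bounds directory
	 * entry is touched with page faults disabled.  If that faults,
	 * the fault is resolved explicitly with get_user_pages() (via
	 * the mpx_resolve_fault() helper below) and the access retried.
	 */
	while (1) {
		pagefault_disable();
		ret = get_user(*bt_addr, bd_entry); /* or user_atomic_cmpxchg_inatomic() */
		pagefault_enable();
		if (!ret)
			break;			/* directory entry accessed OK */
		if (ret == -EFAULT)
			ret = mpx_resolve_fault(bd_entry, need_write);
		if (ret)
			return ret;		/* could not fault the page in */
	}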
-rw-r--r--  arch/x86/include/asm/mmu_context.h  |    6
-rw-r--r--  arch/x86/include/asm/mpx.h          |   14
-rw-r--r--  arch/x86/mm/mpx.c                   |  393
-rw-r--r--  include/asm-generic/mmu_context.h   |    6
-rw-r--r--  mm/mmap.c                           |    2
5 files changed, 421 insertions, 0 deletions
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 0b0ba91ff1ef..00d4575d5409 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -109,4 +109,10 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 	mpx_mm_init(mm);
 }
 
+static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
+			      unsigned long start, unsigned long end)
+{
+	mpx_notify_unmap(mm, vma, start, end);
+}
+
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index 05eecbf8a484..a952a13d59a7 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -51,6 +51,13 @@
 #define MPX_BNDCFG_ADDR_MASK	(~((1UL<<MPX_BNDCFG_TAIL)-1))
 #define MPX_BNDSTA_ERROR_CODE	0x3
 
+#define MPX_BD_ENTRY_MASK	((1<<MPX_BD_ENTRY_OFFSET)-1)
+#define MPX_BT_ENTRY_MASK	((1<<MPX_BT_ENTRY_OFFSET)-1)
+#define MPX_GET_BD_ENTRY_OFFSET(addr)	((((addr)>>(MPX_BT_ENTRY_OFFSET+ \
+		MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT)
+#define MPX_GET_BT_ENTRY_OFFSET(addr)	((((addr)>>MPX_IGN_BITS) & \
+		MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT)
+
 #ifdef CONFIG_X86_INTEL_MPX
 siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
 				struct xsave_struct *xsave_buf);
@@ -67,6 +74,8 @@ static inline void mpx_mm_init(struct mm_struct *mm)
 	 */
 	mm->bd_addr = MPX_INVALID_BOUNDS_DIR;
 }
+void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
+		      unsigned long start, unsigned long end);
 #else
 static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
 					      struct xsave_struct *xsave_buf)
@@ -84,6 +93,11 @@ static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
 static inline void mpx_mm_init(struct mm_struct *mm)
 {
 }
+static inline void mpx_notify_unmap(struct mm_struct *mm,
+				    struct vm_area_struct *vma,
+				    unsigned long start, unsigned long end)
+{
+}
 #endif /* CONFIG_X86_INTEL_MPX */
 
 #endif /* _ASM_X86_MPX_H */
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 96266375441e..f30b48e3a991 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -13,6 +13,7 @@
 #include <asm/i387.h>
 #include <asm/insn.h>
 #include <asm/mman.h>
+#include <asm/mmu_context.h>
 #include <asm/mpx.h>
 #include <asm/processor.h>
 #include <asm/fpu-internal.h>
@@ -26,6 +27,11 @@ static struct vm_operations_struct mpx_vma_ops = {
 	.name = mpx_mapping_name,
 };
 
+static int is_mpx_vma(struct vm_area_struct *vma)
+{
+	return (vma->vm_ops == &mpx_vma_ops);
+}
+
 /*
  * This is really a simplified "vm_mmap". it only handles MPX
  * bounds tables (the bounds directory is user-allocated).
@@ -534,3 +540,390 @@ int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
 	}
 	return 0;
 }
+
+/*
+ * A thin wrapper around get_user_pages().  Returns 0 if the
+ * fault was resolved or -errno if not.
+ */
+static int mpx_resolve_fault(long __user *addr, int write)
+{
+	long gup_ret;
+	int nr_pages = 1;
+	int force = 0;
+
+	gup_ret = get_user_pages(current, current->mm, (unsigned long)addr,
+				 nr_pages, write, force, NULL, NULL);
+	/*
+	 * get_user_pages() returns number of pages gotten.
+	 * 0 means we failed to fault in and get anything,
+	 * probably because 'addr' is bad.
+	 */
+	if (!gup_ret)
+		return -EFAULT;
+	/* Other error, return it */
+	if (gup_ret < 0)
+		return gup_ret;
+	/* must have gup'd a page and gup_ret > 0, success */
+	return 0;
+}
+
+/*
+ * Get the base of the bounds table pointed to by a specific
+ * bounds directory entry.
+ */
+static int get_bt_addr(struct mm_struct *mm,
+			long __user *bd_entry, unsigned long *bt_addr)
+{
+	int ret;
+	int valid_bit;
+
+	if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry)))
+		return -EFAULT;
+
+	while (1) {
+		int need_write = 0;
+
+		pagefault_disable();
+		ret = get_user(*bt_addr, bd_entry);
+		pagefault_enable();
+		if (!ret)
+			break;
+		if (ret == -EFAULT)
+			ret = mpx_resolve_fault(bd_entry, need_write);
+		/*
+		 * If we could not resolve the fault, consider it
+		 * userspace's fault and error out.
+		 */
+		if (ret)
+			return ret;
+	}
+
+	valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG;
+	*bt_addr &= MPX_BT_ADDR_MASK;
+
+	/*
+	 * When the kernel is managing bounds tables, a bounds directory
+	 * entry will either have a valid address (plus the valid bit)
+	 * *OR* be completely empty.  If we see a !valid entry *and* some
+	 * data in the address field, we know something is wrong.  This
+	 * -EINVAL return will cause a SIGSEGV.
+	 */
+	if (!valid_bit && *bt_addr)
+		return -EINVAL;
+	/*
+	 * Do we have a completely zeroed bt entry?  That is OK.  It
+	 * just means there was no bounds table for this memory.  Make
+	 * sure to distinguish this from -EINVAL, which will cause
+	 * a SEGV.
+	 */
+	if (!valid_bit)
+		return -ENOENT;
+
+	return 0;
+}
+
+/*
+ * Free the backing physical pages of bounds table 'bt_addr'.
+ * Assume start...end is within that bounds table.
+ */
+static int zap_bt_entries(struct mm_struct *mm,
+		unsigned long bt_addr,
+		unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma;
+	unsigned long addr, len;
+
+	/*
+	 * Find the first overlapping vma.  If vma->vm_start > start, there
+	 * will be a hole in the bounds table.  This -EINVAL return will
+	 * cause a SIGSEGV.
+	 */
+	vma = find_vma(mm, start);
+	if (!vma || vma->vm_start > start)
+		return -EINVAL;
+
+	/*
+	 * A NUMA policy on a VM_MPX VMA could cause this bounds table to
+	 * be split.  So we need to look across the entire 'start -> end'
+	 * range of this bounds table, find all of the VM_MPX VMAs, and
+	 * zap only those.
+	 */
+	addr = start;
+	while (vma && vma->vm_start < end) {
+		/*
+		 * We followed a bounds directory entry down
+		 * here.  If we find a non-MPX VMA, that's bad,
+		 * so stop immediately and return an error.  This
+		 * probably results in a SIGSEGV.
+		 */
+		if (!is_mpx_vma(vma))
+			return -EINVAL;
+
+		len = min(vma->vm_end, end) - addr;
+		zap_page_range(vma, addr, len, NULL);
+
+		vma = vma->vm_next;
+		addr = vma->vm_start;
+	}
+
+	return 0;
+}
+
+static int unmap_single_bt(struct mm_struct *mm,
+		long __user *bd_entry, unsigned long bt_addr)
+{
+	unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+	unsigned long actual_old_val = 0;
+	int ret;
+
+	while (1) {
+		int need_write = 1;
+
+		pagefault_disable();
+		ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry,
+						   expected_old_val, 0);
+		pagefault_enable();
+		if (!ret)
+			break;
+		if (ret == -EFAULT)
+			ret = mpx_resolve_fault(bd_entry, need_write);
+		/*
+		 * If we could not resolve the fault, consider it
+		 * userspace's fault and error out.
+		 */
+		if (ret)
+			return ret;
+	}
+	/*
+	 * The cmpxchg was performed, check the results.
+	 */
+	if (actual_old_val != expected_old_val) {
+		/*
+		 * Someone else raced with us to unmap the table.
+		 * There was no bounds table pointed to by the
+		 * directory, so declare success.  Somebody freed
+		 * it.
+		 */
+		if (!actual_old_val)
+			return 0;
+		/*
+		 * Something messed with the bounds directory
+		 * entry.  We hold mmap_sem for read or write
+		 * here, so it could not be a _new_ bounds table
+		 * that someone just allocated.  Something is
+		 * wrong, so pass up the error and SIGSEGV.
+		 */
+		return -EINVAL;
+	}
+
+	/*
+	 * Note, we are likely being called under do_munmap() already.  To
+	 * avoid recursion, do_munmap()'s unmap hook checks the VM_MPX flag
+	 * and will not try to free bounds tables for bounds-table VMAs.
+	 */
+	return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES);
+}
+
+/*
+ * If the bounds table pointed to by bounds directory 'bd_entry' is
+ * not shared, unmap this whole bounds table.  Otherwise, only free
+ * the backing physical pages of the bounds table entries covered
+ * by the virtual address region start...end.
+ */
+static int unmap_shared_bt(struct mm_struct *mm,
+		long __user *bd_entry, unsigned long start,
+		unsigned long end, bool prev_shared, bool next_shared)
+{
+	unsigned long bt_addr;
+	int ret;
+
+	ret = get_bt_addr(mm, bd_entry, &bt_addr);
+	/*
+	 * We could see an "error" ret for not-present bounds
+	 * tables (not really an error), or actual errors, but
+	 * stop unmapping either way.
+	 */
+	if (ret)
+		return ret;
+
+	if (prev_shared && next_shared)
+		ret = zap_bt_entries(mm, bt_addr,
+				bt_addr+MPX_GET_BT_ENTRY_OFFSET(start),
+				bt_addr+MPX_GET_BT_ENTRY_OFFSET(end));
+	else if (prev_shared)
+		ret = zap_bt_entries(mm, bt_addr,
+				bt_addr+MPX_GET_BT_ENTRY_OFFSET(start),
+				bt_addr+MPX_BT_SIZE_BYTES);
+	else if (next_shared)
+		ret = zap_bt_entries(mm, bt_addr, bt_addr,
+				bt_addr+MPX_GET_BT_ENTRY_OFFSET(end));
+	else
+		ret = unmap_single_bt(mm, bd_entry, bt_addr);
+
+	return ret;
+}
+
+/*
+ * A virtual address region being munmap()ed might share a bounds table
+ * with adjacent VMAs.  We only need to free the backing physical
+ * memory of the shared bounds table entries covered by this virtual
+ * address region.
+ */
+static int unmap_edge_bts(struct mm_struct *mm,
+		unsigned long start, unsigned long end)
+{
+	int ret;
+	long __user *bde_start, *bde_end;
+	struct vm_area_struct *prev, *next;
+	bool prev_shared = false, next_shared = false;
+
+	bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start);
+	bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1);
+
+	/*
+	 * Check whether bde_start and bde_end are shared with adjacent
+	 * VMAs.
+	 *
+	 * We already unlinked the VMAs from the mm's rbtree so 'start'
+	 * is guaranteed to be in a hole.  This gets us the first VMA
+	 * before the hole into 'prev' and the next VMA after the hole
+	 * into 'next'.
+	 */
+	next = find_vma_prev(mm, start, &prev);
+	if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1))
+			== bde_start)
+		prev_shared = true;
+	if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start))
+			== bde_end)
+		next_shared = true;
+
+	/*
+	 * This virtual address region being munmap()ed is only
+	 * covered by one bounds table.
+	 *
+	 * In this case, if this table is also shared with adjacent
+	 * VMAs, only part of the backing physical memory of the bounds
+	 * table needs to be freed.  Otherwise the whole bounds table
+	 * needs to be unmapped.
+	 */
+	if (bde_start == bde_end) {
+		return unmap_shared_bt(mm, bde_start, start, end,
+				prev_shared, next_shared);
+	}
+
+	/*
+	 * If more than one bounds table is covered by this virtual
+	 * address region being munmap()ed, we need to separately check
+	 * whether bde_start and bde_end are shared with adjacent VMAs.
+	 */
+	ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false);
+	if (ret)
+		return ret;
+	ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int mpx_unmap_tables(struct mm_struct *mm,
+		unsigned long start, unsigned long end)
+{
+	int ret;
+	long __user *bd_entry, *bde_start, *bde_end;
+	unsigned long bt_addr;
+
+	/*
+	 * "Edge" bounds tables are those which are being used by the region
+	 * (start -> end), but that may be shared with adjacent areas.  If they
+	 * turn out to be completely unshared, they will be freed.  If they are
+	 * shared, we will free the backing store (like an MADV_DONTNEED) for
+	 * areas used by this region.
+	 */
+	ret = unmap_edge_bts(mm, start, end);
+	switch (ret) {
+		/* non-present tables are OK */
+		case 0:
+		case -ENOENT:
+			/* Success, or no tables to unmap */
+			break;
+		case -EINVAL:
+		case -EFAULT:
+		default:
+			return ret;
+	}
+
+	/*
+	 * Only unmap the bounds tables that are
+	 *   1. fully covered
+	 *   2. not at the edges of the mapping, even if fully aligned
+	 */
+	bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start);
+	bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1);
+	for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) {
+		ret = get_bt_addr(mm, bd_entry, &bt_addr);
+		switch (ret) {
+			case 0:
+				break;
+			case -ENOENT:
+				/* No table here, try the next one */
+				continue;
+			case -EINVAL:
+			case -EFAULT:
+			default:
+				/*
+				 * Note: we are being strict here.
+				 * Any time we run in to an issue
+				 * unmapping tables, we stop and
+				 * SIGSEGV.
+				 */
+				return ret;
+		}
+
+		ret = unmap_single_bt(mm, bd_entry, bt_addr);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Free unused bounds tables covered by a virtual address region being
+ * munmap()ed.  Assume end > start.
+ *
+ * This function will be called by do_munmap(), and the VMAs covering
+ * the virtual address region start...end have already been split if
+ * necessary, and the 'vma' is the first vma in this range (start -> end).
+ */
+void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	int ret;
+
+	/*
+	 * Refuse to do anything unless userspace has asked
+	 * the kernel to help manage the bounds tables.
+	 */
+	if (!kernel_managing_mpx_tables(current->mm))
+		return;
+	/*
+	 * This will look across the entire 'start -> end' range,
+	 * and find all of the non-VM_MPX VMAs.
+	 *
+	 * To avoid recursion, if a VM_MPX vma is found in the range
+	 * (start -> end), we will not continue follow-up work.  This
+	 * recursion represents having bounds tables for bounds tables,
+	 * which should not occur normally.  Being strict about it here
+	 * helps ensure that we do not have an exploitable stack overflow.
+	 */
+	do {
+		if (vma->vm_flags & VM_MPX)
+			return;
+		vma = vma->vm_next;
+	} while (vma && vma->vm_start < end);
+
+	ret = mpx_unmap_tables(mm, start, end);
+	if (ret)
+		force_sig(SIGSEGV, current);
+}
diff --git a/include/asm-generic/mmu_context.h b/include/asm-generic/mmu_context.h
index 1f2a8f9c9264..aa2d8ba35b20 100644
--- a/include/asm-generic/mmu_context.h
+++ b/include/asm-generic/mmu_context.h
@@ -47,4 +47,10 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 {
 }
 
+static inline void arch_unmap(struct mm_struct *mm,
+			struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+}
+
 #endif /* __ASM_GENERIC_MMU_CONTEXT_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 87e82b38453c..814762c15631 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2597,6 +2597,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 	detach_vmas_to_be_unmapped(mm, vma, prev, end);
 	unmap_region(mm, vma, prev, start, end);
 
+	arch_unmap(mm, vma, start, end);
+
 	/* Fix up all other VM information */
 	remove_vma_list(mm, vma);
 