author    Andrea Arcangeli <aarcange@redhat.com>    2016-10-07 20:01:28 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2016-10-07 21:46:29 -0400
commit    e86f15ee64d8ee46255d964d55f74f5ba9af8c36 (patch)
tree      038bb07471fe5673221880f24ca06716403cad42
parent    fb8c41e9ad1f356b06b46a63ada10b7dce2a5d94 (diff)
mm: vma_merge: fix vm_page_prot SMP race condition against rmap_walk
The rmap_walk can access vm_page_prot (and potentially vm_flags in the
pte/pmd manipulations).  So it's not safe to wait for the caller to update
vm_page_prot/vm_flags after vma_merge has returned, potentially removing the
"next" vma and extending the "current" vma over the next->vm_start,vm_end
range, but still with the "current" vma's vm_page_prot, after releasing the
rmap locks.

The vm_page_prot/vm_flags must be transferred from the "next" vma to the
current vma while vma_merge still holds the rmap locks.

The side effect of this race condition is pte corruption during migrate:
remove_migration_ptes, when run on an address of the "next" vma that got
removed, used the vm_page_prot of the current vma.

    migrate                              mprotect
    ------------                         -------------
    migrating in "next" vma
                                         vma_merge() # removes "next" vma and
                                                     # extends "current" vma
                                                     # current vma is not with
                                                     # vm_page_prot updated
    remove_migration_ptes
    read vm_page_prot of current "vma"
    establish pte with wrong permissions
                                         vm_set_page_prot(vma) # too late!
                                         change_protection in the old vma range
                                         only, next range is not updated

This caused segmentation faults and potentially memory corruption in heavy
mprotect loads with some light page migration caused by compaction in the
background.

Hugh Dickins pointed out the comment about the odd case 8 in vma_merge,
which confirms that case 8 is the only buggy case where the race can
trigger; in all other vma_merge cases the above cannot happen.

This fix removes the oddness factor from case 8 and converts it from:

    AAAA
    PPPPNNNNXXXX -> PPPPNNNNNNNN

to:

    AAAA
    PPPPNNNNXXXX -> PPPPXXXXXXXX

XXXX has the right vma properties for the whole merged vma returned by
vma_adjust, so it solves the problem fully.  It has the added benefit that
the callers could stop updating vma properties when vma_merge succeeds;
however, the callers are not updated by this patch (there are bits like
VM_SOFTDIRTY that still need special care for the whole range, as the vma
merging ignores them, but as long as they're not processed by rmap walks
and instead are accessed with the mmap_sem held at least for reading, they
are fine not to be updated within vma_adjust before releasing the
rmap_locks).

Link: http://lkml.kernel.org/r/1474309513-20313-1-git-send-email-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Aditya Mandaleeka <adityam@microsoft.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Jan Vorlicek <janvorli@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
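To make the ordering problem concrete, here is a small user-space model of the
window described above (illustrative only: struct region, walker() and the
mutex are invented stand-ins, not kernel APIs).  A reader that rebuilds entries
from the protection field under a lock can observe the extended range together
with the stale protection, because the protection is only updated after the
lock has been dropped.

/* Not kernel code: a user-space model of the race window. */
#include <pthread.h>
#include <stdio.h>

struct region {
        pthread_mutex_t lock;   /* stands in for the rmap locks    */
        unsigned long end;      /* stands in for vma->vm_end       */
        int prot;               /* stands in for vma->vm_page_prot */
};

static struct region r = { PTHREAD_MUTEX_INITIALIZER, 0x1000, 0 /* RO */ };

/* Plays the role of an rmap walker such as remove_migration_ptes: it
 * re-establishes an "entry" from the protection it reads under the lock. */
static void *walker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&r.lock);
        printf("walker sees end=%#lx prot=%d\n", r.end, r.prot);
        pthread_mutex_unlock(&r.lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        /* "vma_merge": the range is extended while the lock is held ... */
        pthread_mutex_lock(&r.lock);
        r.end = 0x2000;
        pthread_mutex_unlock(&r.lock);

        /* ... a walker running here sees end=0x2000 with the stale prot */
        pthread_create(&t, NULL, walker, NULL);
        pthread_join(t, NULL);

        /* "vm_set_page_prot(vma)": too late, done after the lock was dropped */
        r.prot = 1 /* RW */;
        return 0;
}

The patch below closes this window by having vma_merge()/__vma_adjust() leave
the surviving vma with the correct vm_page_prot/vm_flags before the rmap locks
are dropped.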
-rw-r--r--  include/linux/mm.h   10
-rw-r--r--  mm/mmap.c           157
-rw-r--r--  mm/mprotect.c         1
3 files changed, 139 insertions(+), 29 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 040a04a88996..2c8ed8a894c8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1968,8 +1968,14 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
 
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
-extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
-	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
+extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+	struct vm_area_struct *expand);
+static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
+	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+{
+	return __vma_adjust(vma, start, end, pgoff, insert, NULL);
+}
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
 	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
 	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
diff --git a/mm/mmap.c b/mm/mmap.c
index 183694b80bcc..e53637f8ac42 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -601,14 +601,24 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 	mm->map_count++;
 }
 
-static inline void
-__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
-		struct vm_area_struct *prev)
+static __always_inline void __vma_unlink_common(struct mm_struct *mm,
+						struct vm_area_struct *vma,
+						struct vm_area_struct *prev,
+						bool has_prev)
 {
 	struct vm_area_struct *next;
 
 	vma_rb_erase(vma, &mm->mm_rb);
-	prev->vm_next = next = vma->vm_next;
+	next = vma->vm_next;
+	if (has_prev)
+		prev->vm_next = next;
+	else {
+		prev = vma->vm_prev;
+		if (prev)
+			prev->vm_next = next;
+		else
+			mm->mmap = next;
+	}
 	if (next)
 		next->vm_prev = prev;
 
@@ -616,6 +626,19 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 	vmacache_invalidate(mm);
 }
 
+static inline void __vma_unlink_prev(struct mm_struct *mm,
+				     struct vm_area_struct *vma,
+				     struct vm_area_struct *prev)
+{
+	__vma_unlink_common(mm, vma, prev, true);
+}
+
+static inline void __vma_unlink(struct mm_struct *mm,
+				struct vm_area_struct *vma)
+{
+	__vma_unlink_common(mm, vma, NULL, false);
+}
+
 /*
  * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
  * is already present in an i_mmap tree without adjusting the tree.
@@ -623,11 +646,12 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  * are necessary. The "insert" vma (if any) is to be inserted
  * before we drop the necessary locks.
  */
-int vma_adjust(struct vm_area_struct *vma, unsigned long start,
-	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+	struct vm_area_struct *expand)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct vm_area_struct *next = vma->vm_next;
+	struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
 	struct address_space *mapping = NULL;
 	struct rb_root *root = NULL;
 	struct anon_vma *anon_vma = NULL;
@@ -643,9 +667,38 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		/*
 		 * vma expands, overlapping all the next, and
 		 * perhaps the one after too (mprotect case 6).
+		 * The only two other cases that gets here are
+		 * case 1, case 7 and case 8.
 		 */
-		remove_next = 1 + (end > next->vm_end);
-		end = next->vm_end;
+		if (next == expand) {
+			/*
+			 * The only case where we don't expand "vma"
+			 * and we expand "next" instead is case 8.
+			 */
+			VM_WARN_ON(end != next->vm_end);
+			/*
+			 * remove_next == 3 means we're
+			 * removing "vma" and that to do so we
+			 * swapped "vma" and "next".
+			 */
+			remove_next = 3;
+			VM_WARN_ON(file != next->vm_file);
+			swap(vma, next);
+		} else {
+			VM_WARN_ON(expand != vma);
+			/*
+			 * case 1, 6, 7, remove_next == 2 is case 6,
+			 * remove_next == 1 is case 1 or 7.
+			 */
+			remove_next = 1 + (end > next->vm_end);
+			VM_WARN_ON(remove_next == 2 &&
+				   end != next->vm_next->vm_end);
+			VM_WARN_ON(remove_next == 1 &&
+				   end != next->vm_end);
+			/* trim end to next, for case 6 first pass */
+			end = next->vm_end;
+		}
+
 		exporter = next;
 		importer = vma;
 
@@ -664,6 +717,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
 			exporter = next;
 			importer = vma;
+			VM_WARN_ON(expand != importer);
 		} else if (end < vma->vm_end) {
 			/*
 			 * vma shrinks, and !insert tells it's not
@@ -673,6 +727,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 			adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
 			exporter = vma;
 			importer = next;
+			VM_WARN_ON(expand != importer);
 		}
 
 		/*
@@ -690,7 +745,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		}
 	}
 again:
-	vma_adjust_trans_huge(vma, start, end, adjust_next);
+	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
 
 	if (file) {
 		mapping = file->f_mapping;
@@ -716,8 +771,8 @@ again:
 	if (!anon_vma && adjust_next)
 		anon_vma = next->anon_vma;
 	if (anon_vma) {
-		VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
-			  anon_vma != next->anon_vma, next);
+		VM_WARN_ON(adjust_next && next->anon_vma &&
+			   anon_vma != next->anon_vma);
 		anon_vma_lock_write(anon_vma);
 		anon_vma_interval_tree_pre_update_vma(vma);
 		if (adjust_next)
@@ -757,7 +812,11 @@ again:
 		 * vma_merge has merged next into vma, and needs
 		 * us to remove next before dropping the locks.
 		 */
-		__vma_unlink(mm, next, vma);
+		if (remove_next != 3)
+			__vma_unlink_prev(mm, next, vma);
+		else
+			/* vma is not before next if they've been swapped */
+			__vma_unlink(mm, next);
 		if (file)
 			__remove_shared_vm_struct(next, file, mapping);
 	} else if (insert) {
@@ -809,7 +868,27 @@ again:
 		 * we must remove another next too. It would clutter
 		 * up the code too much to do both in one go.
 		 */
-		next = vma->vm_next;
+		if (remove_next != 3) {
+			/*
+			 * If "next" was removed and vma->vm_end was
+			 * expanded (up) over it, in turn
+			 * "next->vm_prev->vm_end" changed and the
+			 * "vma->vm_next" gap must be updated.
+			 */
+			next = vma->vm_next;
+		} else {
+			/*
+			 * For the scope of the comment "next" and
+			 * "vma" considered pre-swap(): if "vma" was
+			 * removed, next->vm_start was expanded (down)
+			 * over it and the "next" gap must be updated.
+			 * Because of the swap() the post-swap() "vma"
+			 * actually points to pre-swap() "next"
+			 * (post-swap() "next" as opposed is now a
+			 * dangling pointer).
+			 */
+			next = vma;
+		}
 		if (remove_next == 2) {
 			remove_next = 1;
 			end = next->vm_end;
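The pre-swap()/post-swap() comment in the hunk above is easier to follow with a
tiny stand-alone illustration (invented names, not kernel code): after swapping
the two local pointers in the remove_next == 3 path, the local called "vma" is
the surviving vma and the local called "next" points at the one being removed,
which is why the code continues with next = vma.

/* Not kernel code: toy model of the pointer swap in the remove_next == 3 path. */
#include <assert.h>

struct obj { int alive; };

static void swap_ptr(struct obj **a, struct obj **b)
{
        struct obj *t = *a;

        *a = *b;
        *b = t;
}

int main(void)
{
        struct obj kept = { 1 }, removed = { 1 };
        struct obj *vma = &removed;     /* case 8: this one will be unlinked */
        struct obj *next = &kept;       /* this one is expanded over it      */

        swap_ptr(&vma, &next);          /* like swap(vma, next) above        */
        removed.alive = 0;              /* the pre-swap "vma" goes away      */

        assert(vma->alive);             /* post-swap "vma" is the merged vma */
        assert(!next->alive);           /* post-swap "next" must not be used */
        return 0;
}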
@@ -958,13 +1037,24 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  * cannot merge    might become    might become    might become
  * PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
  * mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
- * mremap move:                                    PPPPNNNNNNNN 8
+ * mremap move:                                    PPPPXXXXXXXX 8
  *        AAAA
  *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
  * might become    case 1 below    case 2 below        case 3 below
  *
- * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
- * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
+ * It is important for case 8 that the the vma NNNN overlapping the
+ * region AAAA is never going to extended over XXXX. Instead XXXX must
+ * be extended in region AAAA and NNNN must be removed. This way in
+ * all cases where vma_merge succeeds, the moment vma_adjust drops the
+ * rmap_locks, the properties of the merged vma will be already
+ * correct for the whole merged range. Some of those properties like
+ * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
+ * be correct for the whole merged range immediately after the
+ * rmap_locks are released. Otherwise if XXXX would be removed and
+ * NNNN would be extended over the XXXX range, remove_migration_ptes
+ * or other rmap walkers (if working on addresses beyond the "end"
+ * parameter) may establish ptes with the wrong permissions of NNNN
+ * instead of the right permissions of XXXX.
  */
 struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			struct vm_area_struct *prev, unsigned long addr,
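As a concrete reading of case 8 in the comment above (addresses invented for
illustration, not taken from the patch): with PPPP = [0x0000,0x4000),
NNNN = [0x4000,0x8000), XXXX = [0x8000,0xc000) and the request AAAA =
[0x4000,0x8000) asking for XXXX's properties, the merge now extends XXXX down
over AAAA and removes NNNN, so the surviving vma already carries XXXX's
vm_page_prot for the whole [0x4000,0xc000) range:

/* Not kernel code: a worked example of the case 8 merge direction. */
#include <assert.h>

struct range { unsigned long start, end; int prot; };

int main(void)
{
        struct range pppp = { 0x0000, 0x4000, 0 };
        struct range nnnn = { 0x4000, 0x8000, 0 };      /* will be removed   */
        struct range xxxx = { 0x8000, 0xc000, 1 };      /* expands downwards */

        /* the old behaviour extended NNNN over XXXX and kept NNNN's prot;
         * the fixed behaviour extends XXXX over AAAA instead:            */
        xxxx.start = nnnn.start;

        assert(xxxx.start == 0x4000 && xxxx.end == 0xc000);
        assert(xxxx.prot == 1);         /* right prot for the merged range */
        (void)pppp;
        return 0;
}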
@@ -989,9 +1079,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	else
 		next = mm->mmap;
 	area = next;
-	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
+	if (area && area->vm_end == end)		/* cases 6, 7, 8 */
 		next = next->vm_next;
 
+	/* verify some invariant that must be enforced by the caller */
+	VM_WARN_ON(prev && addr <= prev->vm_start);
+	VM_WARN_ON(area && end > area->vm_end);
+	VM_WARN_ON(addr >= end);
+
 	/*
 	 * Can it merge with the predecessor?
 	 */
@@ -1012,11 +1107,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			is_mergeable_anon_vma(prev->anon_vma,
 					      next->anon_vma, NULL)) {
 							/* cases 1, 6 */
-			err = vma_adjust(prev, prev->vm_start,
-				next->vm_end, prev->vm_pgoff, NULL);
+			err = __vma_adjust(prev, prev->vm_start,
+					 next->vm_end, prev->vm_pgoff, NULL,
+					 prev);
 		} else					/* cases 2, 5, 7 */
-			err = vma_adjust(prev, prev->vm_start,
-				end, prev->vm_pgoff, NULL);
+			err = __vma_adjust(prev, prev->vm_start,
+					 end, prev->vm_pgoff, NULL, prev);
 		if (err)
 			return NULL;
 		khugepaged_enter_vma_merge(prev, vm_flags);
@@ -1032,11 +1128,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 					     anon_vma, file, pgoff+pglen,
 					     vm_userfaultfd_ctx)) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
-			err = vma_adjust(prev, prev->vm_start,
-				addr, prev->vm_pgoff, NULL);
-		else					/* cases 3, 8 */
-			err = vma_adjust(area, addr, next->vm_end,
-				next->vm_pgoff - pglen, NULL);
+			err = __vma_adjust(prev, prev->vm_start,
+					 addr, prev->vm_pgoff, NULL, next);
+		else {					/* cases 3, 8 */
+			err = __vma_adjust(area, addr, next->vm_end,
+					 next->vm_pgoff - pglen, NULL, next);
+			/*
+			 * In case 3 area is already equal to next and
+			 * this is a noop, but in case 8 "area" has
+			 * been removed and next was expanded over it.
+			 */
+			area = next;
+		}
 		if (err)
 			return NULL;
 		khugepaged_enter_vma_merge(area, vm_flags);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 063bbed22c7b..ec91dfd3f900 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -304,6 +304,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 			   vma->vm_userfaultfd_ctx);
 	if (*pprev) {
 		vma = *pprev;
+		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
 		goto success;
 	}
 
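The warning added in mprotect_fixup() relies on a common bit-twiddling idiom:
(old ^ new) & ~IGNORED is non-zero exactly when the two flag words differ in a
bit that is not allowed to differ; here only VM_SOFTDIRTY may legitimately
still differ after a successful merge.  A stand-alone sketch of the idiom, with
an invented flag value rather than the kernel's VM_* encoding:

/* Not kernel code: the (old ^ new) & ~IGNORED flag-comparison idiom. */
#include <assert.h>

#define IGNORED 0x8u    /* plays the role of VM_SOFTDIRTY */

static int differs_in_relevant_bits(unsigned int old_flags, unsigned int new_flags)
{
        return ((old_flags ^ new_flags) & ~IGNORED) != 0;
}

int main(void)
{
        /* only the ignored bit differs: no warning would fire */
        assert(!differs_in_relevant_bits(0x3u | IGNORED, 0x3u));

        /* a relevant bit differs: the warning would fire */
        assert(differs_in_relevant_bits(0x3u, 0x1u));
        return 0;
}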