author	Minchan Kim <minchan@kernel.org>	2017-08-10 18:24:15 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-08-10 18:54:07 -0400
commit	b3a81d0841a954a3d3e782059960f53e63b37066 (patch)
tree	fde4e0d90345940a62f962667784df06e4317769
parent	99baac21e4585f4258f919502c6e23f1e5edc98c (diff)
mm: fix KSM data corruption
Nadav reported that KSM can corrupt user data through a TLB batching
race[1], meaning data the user wrote can be lost.

Quote from Nadav Amit:

 "For this race we need 4 CPUs:

  CPU0: Caches a writable and dirty PTE entry, and uses the stale value
  for a write later.

  CPU1: Runs madvise_free on the range that includes the PTE.  It would
  clear the dirty-bit.  It batches TLB flushes.

  CPU2: Writes 4 to /proc/PID/clear_refs, clearing the PTEs soft-dirty.
  We care about the fact that it clears the PTE write-bit, and of
  course, batches TLB flushes.

  CPU3: Runs KSM.  Our purpose is to pass the following test in
  write_protect_page():

	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)))

  since it will avoid a TLB flush.  And we want to do it while the PTE
  is stale.  Later, and before replacing the page, we would be able to
  change the page.

  Note that all the operations CPU1-3 perform can happen in parallel
  since they only acquire mmap_sem for read.

  We start with two identical pages.  Everything below regards the same
  page/PTE.

  CPU0			CPU1		CPU2		CPU3
  ----			----		----		----
  Write the same
  value on page
  [cache PTE as
   dirty in TLB]
			MADV_FREE
			pte_mkclean()
					4 > clear_refs
					pte_wrprotect()
							write_protect_page()
							[ success, no flush ]
							pages_identical()
							[ ok ]
  Write to page
  different value
  [Ok, using stale
   PTE]
							replace_page()

  Later, CPU1, CPU2 and CPU3 would flush the TLB, but that is too late.
  CPU0 already wrote on the page, but KSM ignored this write, and it
  got lost"

In the above scenario, MADV_FREE is fixed by changing the TLB batching
API, including [set|clear]_tlb_flush_pending.  What remains is the
soft-dirty part.

This patch changes soft-dirty to use the TLB batching API instead of
flush_tlb_mm, and makes KSM check for pending TLB flushes with
mm_tlb_flush_pending, so that the TLB is flushed and no data is lost
when other parallel threads have TLB flushes pending.

[1] http://lkml.kernel.org/r/BD3A0EBE-ECF4-41D4-87FA-C755EA9AB6BD@gmail.com

Link: http://lkml.kernel.org/r/20170802000818.4760-8-namit@vmware.com
Signed-off-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Nadav Amit <namit@vmware.com>
Reported-by: Nadav Amit <namit@vmware.com>
Tested-by: Nadav Amit <namit@vmware.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
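To make the interplay concrete, here is a condensed sketch of the
protocol after this patch (hedged pseudo-kernel code following the
4.13-era API used in the diffs below, not verbatim source; "addr" and
"pte" stand in for the real locals):

	/* Writer side, e.g. clear_refs soft-dirty clearing: */
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm, 0, -1);   /* mm_tlb_flush_pending(mm) turns true */
	/* ... pte_wrprotect()/pte_mkclean() across the address space ... */
	tlb_finish_mmu(&tlb, 0, -1);       /* flush the TLB, then drop the flag */

	/* Reader side, KSM's write_protect_page(): */
	if (pte_write(*pte) || pte_dirty(*pte) || mm_tlb_flush_pending(mm)) {
		/* must not skip flushing: a stale writable TLB entry may exist */
		entry = ptep_clear_flush(vma, addr, pte);
	}

Skipping the flush is safe only when nobody is mid-batch; the pending
check closes exactly the window the race above exploits.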
-rw-r--r--	fs/proc/task_mmu.c	7
-rw-r--r--	mm/ksm.c	3
2 files changed, 7 insertions, 3 deletions
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b836fd61ed87..fe8f3265e877 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -16,9 +16,10 @@
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
+#include <linux/uaccess.h>
 
 #include <asm/elf.h>
-#include <linux/uaccess.h>
+#include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
 
@@ -1008,6 +1009,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	enum clear_refs_types type;
+	struct mmu_gather tlb;
 	int itype;
 	int rv;
 
@@ -1054,6 +1056,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		}
 
 		down_read(&mm->mmap_sem);
+		tlb_gather_mmu(&tlb, mm, 0, -1);
 		if (type == CLEAR_REFS_SOFT_DIRTY) {
 			for (vma = mm->mmap; vma; vma = vma->vm_next) {
 				if (!(vma->vm_flags & VM_SOFTDIRTY))
@@ -1075,7 +1078,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
 		if (type == CLEAR_REFS_SOFT_DIRTY)
 			mmu_notifier_invalidate_range_end(mm, 0, -1);
-		flush_tlb_mm(mm);
+		tlb_finish_mmu(&tlb, 0, -1);
 		up_read(&mm->mmap_sem);
 out_mm:
 		mmput(mm);
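What makes the tlb_gather_mmu()/tlb_finish_mmu() bracketing above
visible to KSM is the pending-flush accounting this series added to the
batching API.  Roughly (a hedged sketch, not the exact 4.13 source,
which also documents the required memory barriers):

	static inline void inc_tlb_flush_pending(struct mm_struct *mm)
	{
		atomic_inc(&mm->tlb_flush_pending);	/* from tlb_gather_mmu() */
	}

	static inline void dec_tlb_flush_pending(struct mm_struct *mm)
	{
		atomic_dec(&mm->tlb_flush_pending);	/* from tlb_finish_mmu(),
							   only after the flush */
	}

Because the counter is raised before any PTE is modified and lowered
only after the flush completes, a thread that observes it as zero may
trust the PTE state it reads.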
diff --git a/mm/ksm.c b/mm/ksm.c
index 4dc92f138786..db20f8436bc3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1038,7 +1038,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 		goto out_unlock;
 
 	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
-	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) {
+	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
+						mm_tlb_flush_pending(mm)) {
 		pte_t entry;
 
 		swapped = PageSwapCache(page);
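On the KSM side, the new mm_tlb_flush_pending(mm) condition forces
write_protect_page() into the ptep_clear_flush() branch whenever that
accounting is nonzero.  Roughly (again a hedged sketch of the 4.13-era
reader, omitting the barrier the real helper pairs with the atomic
read):

	static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
	{
		/*
		 * True while some thread is between tlb_gather_mmu() and
		 * tlb_finish_mmu(): its PTE changes may not have reached
		 * every CPU's TLB yet, so a clean/read-only PTE cannot be
		 * trusted without flushing first.
		 */
		return atomic_read(&mm->tlb_flush_pending) > 0;
	}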