-rw-r--r--  Documentation/vm/soft-dirty.txt |  7
-rw-r--r--  fs/exec.c                       |  2
-rw-r--r--  fs/proc/task_mmu.c              | 46
-rw-r--r--  include/linux/mm.h              |  6
-rw-r--r--  mm/mmap.c                       | 12
5 files changed, 61 insertions(+), 12 deletions(-)
diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt
index 9a12a5956bc0..55684d11a1e8 100644
--- a/Documentation/vm/soft-dirty.txt
+++ b/Documentation/vm/soft-dirty.txt
@@ -28,6 +28,13 @@ This is so, since the pages are still mapped to physical memory, and thus all
 the kernel does is finds this fact out and puts both writable and soft-dirty
 bits on the PTE.
 
+  While in most cases tracking memory changes by #PF-s is more than enough
+there is still a scenario when we can lose soft dirty bits -- a task
+unmaps a previously mapped memory region and then maps a new one at exactly
+the same place. When unmap is called, the kernel internally clears PTE values
+including soft dirty bits. To notify user space application about such
+memory region renewal the kernel always marks new memory regions (and
+expanded regions) as soft dirty.
+
   This feature is actively used by the checkpoint-restore project. You
 can find more details about it on http://criu.org
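
For context, the behaviour the documentation describes can be exercised from user space with the existing clear_refs/pagemap interfaces. The following is a minimal sketch, not part of the patch; it assumes 4 KiB pages and relies on pagemap bit 55, which the pagemap documentation defines as the soft-dirty bit:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Read the pagemap entry covering vaddr and test bit 55 (soft-dirty). */
    static int page_is_soft_dirty(int pagemap_fd, unsigned long vaddr)
    {
            uint64_t entry;
            off_t off = (vaddr / 4096) * sizeof(entry); /* assumes 4 KiB pages */

            if (pread(pagemap_fd, &entry, sizeof(entry), off) != sizeof(entry))
                    return -1;
            return (entry >> 55) & 1;
    }

    int main(void)
    {
            static char page[4096];
            int clear_fd = open("/proc/self/clear_refs", O_WRONLY);
            int map_fd = open("/proc/self/pagemap", O_RDONLY);

            page[0] = 1;                /* make sure the page is mapped */
            write(clear_fd, "4", 1);    /* writing "4" clears soft-dirty bits */
            page[0] = 2;                /* write again: the #PF re-sets soft-dirty */
            printf("soft-dirty: %d\n",
                   page_is_soft_dirty(map_fd, (unsigned long)page));

            close(clear_fd);
            close(map_fd);
            return 0;
    }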
diff --git a/fs/exec.c b/fs/exec.c
index fd774c7cb483..2d1e52a58fe9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -266,7 +266,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
+	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 107d026f5d6e..09228639b83d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -740,6 +740,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 		ptent = pte_file_clear_soft_dirty(ptent);
 	}
 
+	if (vma->vm_flags & VM_SOFTDIRTY)
+		vma->vm_flags &= ~VM_SOFTDIRTY;
+
 	set_pte_at(vma->vm_mm, addr, pte, ptent);
 #endif
 }
@@ -949,13 +952,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 		if (is_migration_entry(entry))
 			page = migration_entry_to_page(entry);
 	} else {
-		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
+		if (vma->vm_flags & VM_SOFTDIRTY)
+			flags2 |= __PM_SOFT_DIRTY;
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
 		return;
 	}
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
-	if (pte_soft_dirty(pte))
+	if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
 		flags2 |= __PM_SOFT_DIRTY;
 
 	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
@@ -974,7 +979,7 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
 				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
 }
 #else
 static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
@@ -997,7 +1002,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
 		int pmd_flags2;
 
-		pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
+		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
+			pmd_flags2 = __PM_SOFT_DIRTY;
+		else
+			pmd_flags2 = 0;
+
 		for (; addr != end; addr += PAGE_SIZE) {
 			unsigned long offset;
 
@@ -1015,12 +1024,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	if (pmd_trans_unstable(pmd))
 		return 0;
 	for (; addr != end; addr += PAGE_SIZE) {
+		int flags2;
 
 		/* check to see if we've left 'vma' behind
 		 * and need a new, higher one */
 		if (vma && (addr >= vma->vm_end)) {
 			vma = find_vma(walk->mm, addr);
-			pme = make_pme(PM_NOT_PRESENT(pm->v2));
+			if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+				flags2 = __PM_SOFT_DIRTY;
+			else
+				flags2 = 0;
+			pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
 		}
 
 		/* check that 'vma' actually covers this address,
@@ -1044,13 +1058,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 #ifdef CONFIG_HUGETLB_PAGE
 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
-					pte_t pte, int offset)
+					pte_t pte, int offset, int flags2)
 {
 	if (pte_present(pte))
-		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
-				| PM_STATUS2(pm->v2, 0) | PM_PRESENT);
+		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
+				PM_STATUS2(pm->v2, flags2) |
+				PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) |
+				PM_STATUS2(pm->v2, flags2));
 }
 
 /* This function walks within one hugetlb entry in the single call */
@@ -1059,12 +1075,22 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 				 struct mm_walk *walk)
 {
 	struct pagemapread *pm = walk->private;
+	struct vm_area_struct *vma;
 	int err = 0;
+	int flags2;
 	pagemap_entry_t pme;
 
+	vma = find_vma(walk->mm, addr);
+	WARN_ON_ONCE(!vma);
+
+	if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+		flags2 = __PM_SOFT_DIRTY;
+	else
+		flags2 = 0;
+
 	for (; addr != end; addr += PAGE_SIZE) {
 		int offset = (addr & ~hmask) >> PAGE_SHIFT;
-		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
+		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
 		err = add_to_pagemap(addr, &pme, pm);
 		if (err)
 			return err;
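
The task_mmu.c hunks above attach the soft-dirty flag to every kind of pagemap entry, including not-present ones, so a consumer must decode the flag independently of the present bit. Below is a hedged sketch of that decoding; the struct and helper names are illustrative, while the bit positions (0-54 PFN, 55 soft-dirty, 62 swapped, 63 present) are the documented pagemap layout:

    #include <stdint.h>

    /* Illustrative decode of one 64-bit pagemap entry. */
    struct pm_entry {
            uint64_t pfn;       /* bits 0-54, meaningful only when present */
            int present;        /* bit 63 */
            int swapped;        /* bit 62 */
            int soft_dirty;     /* bit 55 */
    };

    static struct pm_entry pm_decode(uint64_t raw)
    {
            struct pm_entry e;

            e.present    = (raw >> 63) & 1;
            e.swapped    = (raw >> 62) & 1;
            e.soft_dirty = (raw >> 55) & 1; /* may be set even if !present now */
            e.pfn        = e.present ? (raw & ((1ULL << 55) - 1)) : 0;
            return e;
    }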
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d2d59b4149d0..dce24569f8fc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -115,6 +115,12 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
 #define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
 
+#ifdef CONFIG_MEM_SOFT_DIRTY
+# define VM_SOFTDIRTY	0x08000000	/* Not soft dirty clean area */
+#else
+# define VM_SOFTDIRTY	0
+#endif
+
 #define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
 #define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
 #define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
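
Note that VM_SOFTDIRTY is defined as 0 when CONFIG_MEM_SOFT_DIRTY is off, so every test and update of the flag elsewhere in the patch constant-folds away and no call site needs an #ifdef. A stand-alone illustration of the idiom (not kernel code):

    #include <stdio.h>

    #define VM_SOFTDIRTY 0  /* the !CONFIG_MEM_SOFT_DIRTY definition */

    int main(void)
    {
            unsigned long vm_flags = 0;

            vm_flags |= VM_SOFTDIRTY;       /* |= 0 is a no-op */
            if (vm_flags & VM_SOFTDIRTY)    /* & 0 folds to 0: dead branch */
                    printf("soft-dirty tracking enabled\n");
            else
                    printf("soft-dirty tracking compiled out\n");
            return 0;
    }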
diff --git a/mm/mmap.c b/mm/mmap.c
index 13926a5a6901..51958d192a48 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1609,6 +1609,15 @@ out:
 	if (file)
 		uprobe_mmap(vma);
 
+	/*
+	 * New (or expanded) vma always get soft dirty status.
+	 * Otherwise user-space soft-dirty page tracker won't
+	 * be able to distinguish situation when vma area unmapped,
+	 * then new mapped in-place (which must be aimed as
+	 * a completely new data area).
+	 */
+	vma->vm_flags |= VM_SOFTDIRTY;
+
 	return addr;
 
 unmap_and_free_vma:
@@ -2652,6 +2661,7 @@ out:
 	mm->total_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
+	vma->vm_flags |= VM_SOFTDIRTY;
 	return addr;
 }
 
@@ -2916,7 +2926,7 @@ int install_special_mapping(struct mm_struct *mm,
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 
-	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
 	vma->vm_ops = &special_mapping_vmops;
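
Taken together, the mmap.c hooks make the unmap-then-map-in-place case from the documentation visible to a tracker. An end-to-end user-space sketch of that scenario, illustrative only; it assumes 4 KiB pages and uses MAP_FIXED purely to force reuse of the same address:

    #include <fcntl.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            size_t len = 4096;
            int clear_fd = open("/proc/self/clear_refs", O_WRONLY);
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            memset(p, 1, len);          /* populate the region */
            write(clear_fd, "4", 1);    /* clear soft-dirty: region reads clean */

            munmap(p, len);             /* PTEs, and their soft-dirty bits, are gone */
            p = mmap(p, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
            /*
             * With this patch the new vma carries VM_SOFTDIRTY, so pagemap
             * reports the whole range soft-dirty again even though nothing
             * has been written yet: the tracker sees it as all-new data.
             */

            close(clear_fd);
            return 0;
    }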