diff options
-rw-r--r-- | Documentation/vm/soft-dirty.txt | 7 | ||||
-rw-r--r-- | fs/exec.c | 2 | ||||
-rw-r--r-- | fs/proc/task_mmu.c | 46 | ||||
-rw-r--r-- | include/linux/mm.h | 6 | ||||
-rw-r--r-- | mm/mmap.c | 12 |
5 files changed, 61 insertions, 12 deletions
diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt index 9a12a5956bc0..55684d11a1e8 100644 --- a/Documentation/vm/soft-dirty.txt +++ b/Documentation/vm/soft-dirty.txt | |||
@@ -28,6 +28,13 @@ This is so, since the pages are still mapped to physical memory, and thus all | |||
28 | the kernel does is finds this fact out and puts both writable and soft-dirty | 28 | the kernel does is finds this fact out and puts both writable and soft-dirty |
29 | bits on the PTE. | 29 | bits on the PTE. |
30 | 30 | ||
31 | While in most cases tracking memory changes by #PF-s is more than enough, | ||
32 | there is still a scenario in which we can lose soft dirty bits -- a task | ||
33 | unmaps a previously mapped memory region and then maps a new one at exactly | ||
34 | the same place. When unmap is called, the kernel internally clears PTE values, | ||
35 | including soft dirty bits. To notify the user space application about such | ||
36 | memory region renewal, the kernel always marks new memory regions (and | ||
37 | expanded regions) as soft dirty. | ||
31 | 38 | ||
32 | This feature is actively used by the checkpoint-restore project. You | 39 | This feature is actively used by the checkpoint-restore project. You |
33 | can find more details about it on http://criu.org | 40 | can find more details about it on http://criu.org |
@@ -266,7 +266,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) | |||
266 | BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); | 266 | BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); |
267 | vma->vm_end = STACK_TOP_MAX; | 267 | vma->vm_end = STACK_TOP_MAX; |
268 | vma->vm_start = vma->vm_end - PAGE_SIZE; | 268 | vma->vm_start = vma->vm_end - PAGE_SIZE; |
269 | vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; | 269 | vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; |
270 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); | 270 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); |
271 | INIT_LIST_HEAD(&vma->anon_vma_chain); | 271 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
272 | 272 | ||
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 107d026f5d6e..09228639b83d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -740,6 +740,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, | |||
740 | ptent = pte_file_clear_soft_dirty(ptent); | 740 | ptent = pte_file_clear_soft_dirty(ptent); |
741 | } | 741 | } |
742 | 742 | ||
743 | if (vma->vm_flags & VM_SOFTDIRTY) | ||
744 | vma->vm_flags &= ~VM_SOFTDIRTY; | ||
745 | |||
743 | set_pte_at(vma->vm_mm, addr, pte, ptent); | 746 | set_pte_at(vma->vm_mm, addr, pte, ptent); |
744 | #endif | 747 | #endif |
745 | } | 748 | } |
@@ -949,13 +952,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | |||
949 | if (is_migration_entry(entry)) | 952 | if (is_migration_entry(entry)) |
950 | page = migration_entry_to_page(entry); | 953 | page = migration_entry_to_page(entry); |
951 | } else { | 954 | } else { |
952 | *pme = make_pme(PM_NOT_PRESENT(pm->v2)); | 955 | if (vma->vm_flags & VM_SOFTDIRTY) |
956 | flags2 |= __PM_SOFT_DIRTY; | ||
957 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); | ||
953 | return; | 958 | return; |
954 | } | 959 | } |
955 | 960 | ||
956 | if (page && !PageAnon(page)) | 961 | if (page && !PageAnon(page)) |
957 | flags |= PM_FILE; | 962 | flags |= PM_FILE; |
958 | if (pte_soft_dirty(pte)) | 963 | if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) |
959 | flags2 |= __PM_SOFT_DIRTY; | 964 | flags2 |= __PM_SOFT_DIRTY; |
960 | 965 | ||
961 | *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); | 966 | *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); |
@@ -974,7 +979,7 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p | |||
974 | *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) | 979 | *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) |
975 | | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); | 980 | | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); |
976 | else | 981 | else |
977 | *pme = make_pme(PM_NOT_PRESENT(pm->v2)); | 982 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); |
978 | } | 983 | } |
979 | #else | 984 | #else |
980 | static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 985 | static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, |
@@ -997,7 +1002,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
997 | if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { | 1002 | if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { |
998 | int pmd_flags2; | 1003 | int pmd_flags2; |
999 | 1004 | ||
1000 | pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); | 1005 | if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) |
1006 | pmd_flags2 = __PM_SOFT_DIRTY; | ||
1007 | else | ||
1008 | pmd_flags2 = 0; | ||
1009 | |||
1001 | for (; addr != end; addr += PAGE_SIZE) { | 1010 | for (; addr != end; addr += PAGE_SIZE) { |
1002 | unsigned long offset; | 1011 | unsigned long offset; |
1003 | 1012 | ||
@@ -1015,12 +1024,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
1015 | if (pmd_trans_unstable(pmd)) | 1024 | if (pmd_trans_unstable(pmd)) |
1016 | return 0; | 1025 | return 0; |
1017 | for (; addr != end; addr += PAGE_SIZE) { | 1026 | for (; addr != end; addr += PAGE_SIZE) { |
1027 | int flags2; | ||
1018 | 1028 | ||
1019 | /* check to see if we've left 'vma' behind | 1029 | /* check to see if we've left 'vma' behind |
1020 | * and need a new, higher one */ | 1030 | * and need a new, higher one */ |
1021 | if (vma && (addr >= vma->vm_end)) { | 1031 | if (vma && (addr >= vma->vm_end)) { |
1022 | vma = find_vma(walk->mm, addr); | 1032 | vma = find_vma(walk->mm, addr); |
1023 | pme = make_pme(PM_NOT_PRESENT(pm->v2)); | 1033 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) |
1034 | flags2 = __PM_SOFT_DIRTY; | ||
1035 | else | ||
1036 | flags2 = 0; | ||
1037 | pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); | ||
1024 | } | 1038 | } |
1025 | 1039 | ||
1026 | /* check that 'vma' actually covers this address, | 1040 | /* check that 'vma' actually covers this address, |
@@ -1044,13 +1058,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
1044 | 1058 | ||
1045 | #ifdef CONFIG_HUGETLB_PAGE | 1059 | #ifdef CONFIG_HUGETLB_PAGE |
1046 | static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 1060 | static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, |
1047 | pte_t pte, int offset) | 1061 | pte_t pte, int offset, int flags2) |
1048 | { | 1062 | { |
1049 | if (pte_present(pte)) | 1063 | if (pte_present(pte)) |
1050 | *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | 1064 | *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | |
1051 | | PM_STATUS2(pm->v2, 0) | PM_PRESENT); | 1065 | PM_STATUS2(pm->v2, flags2) | |
1066 | PM_PRESENT); | ||
1052 | else | 1067 | else |
1053 | *pme = make_pme(PM_NOT_PRESENT(pm->v2)); | 1068 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | |
1069 | PM_STATUS2(pm->v2, flags2)); | ||
1054 | } | 1070 | } |
1055 | 1071 | ||
1056 | /* This function walks within one hugetlb entry in the single call */ | 1072 | /* This function walks within one hugetlb entry in the single call */ |
@@ -1059,12 +1075,22 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, | |||
1059 | struct mm_walk *walk) | 1075 | struct mm_walk *walk) |
1060 | { | 1076 | { |
1061 | struct pagemapread *pm = walk->private; | 1077 | struct pagemapread *pm = walk->private; |
1078 | struct vm_area_struct *vma; | ||
1062 | int err = 0; | 1079 | int err = 0; |
1080 | int flags2; | ||
1063 | pagemap_entry_t pme; | 1081 | pagemap_entry_t pme; |
1064 | 1082 | ||
1083 | vma = find_vma(walk->mm, addr); | ||
1084 | WARN_ON_ONCE(!vma); | ||
1085 | |||
1086 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) | ||
1087 | flags2 = __PM_SOFT_DIRTY; | ||
1088 | else | ||
1089 | flags2 = 0; | ||
1090 | |||
1065 | for (; addr != end; addr += PAGE_SIZE) { | 1091 | for (; addr != end; addr += PAGE_SIZE) { |
1066 | int offset = (addr & ~hmask) >> PAGE_SHIFT; | 1092 | int offset = (addr & ~hmask) >> PAGE_SHIFT; |
1067 | huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); | 1093 | huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); |
1068 | err = add_to_pagemap(addr, &pme, pm); | 1094 | err = add_to_pagemap(addr, &pme, pm); |
1069 | if (err) | 1095 | if (err) |
1070 | return err; | 1096 | return err; |
diff --git a/include/linux/mm.h b/include/linux/mm.h index d2d59b4149d0..dce24569f8fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -115,6 +115,12 @@ extern unsigned int kobjsize(const void *objp); | |||
115 | #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ | 115 | #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ |
116 | #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ | 116 | #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ |
117 | 117 | ||
118 | #ifdef CONFIG_MEM_SOFT_DIRTY | ||
119 | # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ | ||
120 | #else | ||
121 | # define VM_SOFTDIRTY 0 | ||
122 | #endif | ||
123 | |||
118 | #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ | 124 | #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ |
119 | #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ | 125 | #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ |
120 | #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ | 126 | #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ |
@@ -1609,6 +1609,15 @@ out: | |||
1609 | if (file) | 1609 | if (file) |
1610 | uprobe_mmap(vma); | 1610 | uprobe_mmap(vma); |
1611 | 1611 | ||
1612 | /* | ||
1613 | * New (or expanded) vma always get soft dirty status. | ||
1614 | * Otherwise user-space soft-dirty page tracker won't | ||
1615 | * be able to distinguish situation when vma area unmapped, | ||
1616 | * then new mapped in-place (which must be aimed as | ||
1617 | * a completely new data area). | ||
1618 | */ | ||
1619 | vma->vm_flags |= VM_SOFTDIRTY; | ||
1620 | |||
1612 | return addr; | 1621 | return addr; |
1613 | 1622 | ||
1614 | unmap_and_free_vma: | 1623 | unmap_and_free_vma: |
@@ -2652,6 +2661,7 @@ out: | |||
2652 | mm->total_vm += len >> PAGE_SHIFT; | 2661 | mm->total_vm += len >> PAGE_SHIFT; |
2653 | if (flags & VM_LOCKED) | 2662 | if (flags & VM_LOCKED) |
2654 | mm->locked_vm += (len >> PAGE_SHIFT); | 2663 | mm->locked_vm += (len >> PAGE_SHIFT); |
2664 | vma->vm_flags |= VM_SOFTDIRTY; | ||
2655 | return addr; | 2665 | return addr; |
2656 | } | 2666 | } |
2657 | 2667 | ||
@@ -2916,7 +2926,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2916 | vma->vm_start = addr; | 2926 | vma->vm_start = addr; |
2917 | vma->vm_end = addr + len; | 2927 | vma->vm_end = addr + len; |
2918 | 2928 | ||
2919 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; | 2929 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; |
2920 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); | 2930 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); |
2921 | 2931 | ||
2922 | vma->vm_ops = &special_mapping_vmops; | 2932 | vma->vm_ops = &special_mapping_vmops; |