aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/proc/task_mmu.c150
-rw-r--r--tools/vm/page-types.c25
2 files changed, 61 insertions, 114 deletions
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 07c86f51d225..41c0a0a500f7 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -712,23 +712,6 @@ const struct file_operations proc_tid_smaps_operations = {
712 .release = proc_map_release, 712 .release = proc_map_release,
713}; 713};
714 714
715/*
716 * We do not want to have constant page-shift bits sitting in
717 * pagemap entries and are about to reuse them some time soon.
718 *
719 * Here's the "migration strategy":
720 * 1. when the system boots these bits remain what they are,
721 * but a warning about future change is printed in log;
722 * 2. once anyone clears soft-dirty bits via clear_refs file,
723 * these flag is set to denote, that user is aware of the
724 * new API and those page-shift bits change their meaning.
725 * The respective warning is printed in dmesg;
726 * 3. In a couple of releases we will remove all the mentions
727 * of page-shift in pagemap entries.
728 */
729
730static bool soft_dirty_cleared __read_mostly;
731
732enum clear_refs_types { 715enum clear_refs_types {
733 CLEAR_REFS_ALL = 1, 716 CLEAR_REFS_ALL = 1,
734 CLEAR_REFS_ANON, 717 CLEAR_REFS_ANON,
@@ -889,13 +872,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
889 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) 872 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
890 return -EINVAL; 873 return -EINVAL;
891 874
892 if (type == CLEAR_REFS_SOFT_DIRTY) {
893 soft_dirty_cleared = true;
894 pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
895 " See the linux/Documentation/vm/pagemap.txt for "
896 "details.\n");
897 }
898
899 task = get_proc_task(file_inode(file)); 875 task = get_proc_task(file_inode(file));
900 if (!task) 876 if (!task)
901 return -ESRCH; 877 return -ESRCH;
@@ -963,36 +939,24 @@ typedef struct {
963struct pagemapread { 939struct pagemapread {
964 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ 940 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
965 pagemap_entry_t *buffer; 941 pagemap_entry_t *buffer;
966 bool v2;
967}; 942};
968 943
969#define PAGEMAP_WALK_SIZE (PMD_SIZE) 944#define PAGEMAP_WALK_SIZE (PMD_SIZE)
970#define PAGEMAP_WALK_MASK (PMD_MASK) 945#define PAGEMAP_WALK_MASK (PMD_MASK)
971 946
972#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) 947#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
973#define PM_STATUS_BITS 3 948#define PM_PFRAME_BITS 55
974#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) 949#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
975#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) 950#define PM_SOFT_DIRTY BIT_ULL(55)
976#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) 951#define PM_FILE BIT_ULL(61)
977#define PM_PSHIFT_BITS 6 952#define PM_SWAP BIT_ULL(62)
978#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) 953#define PM_PRESENT BIT_ULL(63)
979#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) 954
980#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
981#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
982#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
983/* in "new" pagemap pshift bits are occupied with more status bits */
984#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
985
986#define __PM_SOFT_DIRTY (1LL)
987#define PM_PRESENT PM_STATUS(4LL)
988#define PM_SWAP PM_STATUS(2LL)
989#define PM_FILE PM_STATUS(1LL)
990#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
991#define PM_END_OF_BUFFER 1 955#define PM_END_OF_BUFFER 1
992 956
993static inline pagemap_entry_t make_pme(u64 val) 957static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
994{ 958{
995 return (pagemap_entry_t) { .pme = val }; 959 return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
996} 960}
997 961
998static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, 962static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
@@ -1013,7 +977,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
1013 977
1014 while (addr < end) { 978 while (addr < end) {
1015 struct vm_area_struct *vma = find_vma(walk->mm, addr); 979 struct vm_area_struct *vma = find_vma(walk->mm, addr);
1016 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); 980 pagemap_entry_t pme = make_pme(0, 0);
1017 /* End of address space hole, which we mark as non-present. */ 981 /* End of address space hole, which we mark as non-present. */
1018 unsigned long hole_end; 982 unsigned long hole_end;
1019 983
@@ -1033,7 +997,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
1033 997
1034 /* Addresses in the VMA. */ 998 /* Addresses in the VMA. */
1035 if (vma->vm_flags & VM_SOFTDIRTY) 999 if (vma->vm_flags & VM_SOFTDIRTY)
1036 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); 1000 pme = make_pme(0, PM_SOFT_DIRTY);
1037 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { 1001 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1038 err = add_to_pagemap(addr, &pme, pm); 1002 err = add_to_pagemap(addr, &pme, pm);
1039 if (err) 1003 if (err)
@@ -1044,63 +1008,61 @@ out:
1044 return err; 1008 return err;
1045} 1009}
1046 1010
1047static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1011static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
1048 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 1012 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1049{ 1013{
1050 u64 frame, flags; 1014 u64 frame = 0, flags = 0;
1051 struct page *page = NULL; 1015 struct page *page = NULL;
1052 int flags2 = 0;
1053 1016
1054 if (pte_present(pte)) { 1017 if (pte_present(pte)) {
1055 frame = pte_pfn(pte); 1018 frame = pte_pfn(pte);
1056 flags = PM_PRESENT; 1019 flags |= PM_PRESENT;
1057 page = vm_normal_page(vma, addr, pte); 1020 page = vm_normal_page(vma, addr, pte);
1058 if (pte_soft_dirty(pte)) 1021 if (pte_soft_dirty(pte))
1059 flags2 |= __PM_SOFT_DIRTY; 1022 flags |= PM_SOFT_DIRTY;
1060 } else if (is_swap_pte(pte)) { 1023 } else if (is_swap_pte(pte)) {
1061 swp_entry_t entry; 1024 swp_entry_t entry;
1062 if (pte_swp_soft_dirty(pte)) 1025 if (pte_swp_soft_dirty(pte))
1063 flags2 |= __PM_SOFT_DIRTY; 1026 flags |= PM_SOFT_DIRTY;
1064 entry = pte_to_swp_entry(pte); 1027 entry = pte_to_swp_entry(pte);
1065 frame = swp_type(entry) | 1028 frame = swp_type(entry) |
1066 (swp_offset(entry) << MAX_SWAPFILES_SHIFT); 1029 (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
1067 flags = PM_SWAP; 1030 flags |= PM_SWAP;
1068 if (is_migration_entry(entry)) 1031 if (is_migration_entry(entry))
1069 page = migration_entry_to_page(entry); 1032 page = migration_entry_to_page(entry);
1070 } else {
1071 if (vma->vm_flags & VM_SOFTDIRTY)
1072 flags2 |= __PM_SOFT_DIRTY;
1073 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
1074 return;
1075 } 1033 }
1076 1034
1077 if (page && !PageAnon(page)) 1035 if (page && !PageAnon(page))
1078 flags |= PM_FILE; 1036 flags |= PM_FILE;
1079 if ((vma->vm_flags & VM_SOFTDIRTY)) 1037 if (vma->vm_flags & VM_SOFTDIRTY)
1080 flags2 |= __PM_SOFT_DIRTY; 1038 flags |= PM_SOFT_DIRTY;
1081 1039
1082 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); 1040 return make_pme(frame, flags);
1083} 1041}
1084 1042
1085#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1043#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1086static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1044static pagemap_entry_t thp_pmd_to_pagemap_entry(struct pagemapread *pm,
1087 pmd_t pmd, int offset, int pmd_flags2) 1045 pmd_t pmd, int offset, u64 flags)
1088{ 1046{
1047 u64 frame = 0;
1048
1089 /* 1049 /*
1090 * Currently pmd for thp is always present because thp can not be 1050 * Currently pmd for thp is always present because thp can not be
1091 * swapped-out, migrated, or HWPOISONed (split in such cases instead.) 1051 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
1092 * This if-check is just to prepare for future implementation. 1052 * This if-check is just to prepare for future implementation.
1093 */ 1053 */
1094 if (pmd_present(pmd)) 1054 if (pmd_present(pmd)) {
1095 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) 1055 frame = pmd_pfn(pmd) + offset;
1096 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); 1056 flags |= PM_PRESENT;
1097 else 1057 }
1098 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); 1058
1059 return make_pme(frame, flags);
1099} 1060}
1100#else 1061#else
1101static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1062static pagemap_entry_t thp_pmd_to_pagemap_entry(struct pagemapread *pm,
1102 pmd_t pmd, int offset, int pmd_flags2) 1063 pmd_t pmd, int offset, u64 flags)
1103{ 1064{
1065 return make_pme(0, 0);
1104} 1066}
1105#endif 1067#endif
1106 1068
@@ -1114,12 +1076,10 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1114 int err = 0; 1076 int err = 0;
1115 1077
1116 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1078 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1117 int pmd_flags2; 1079 u64 flags = 0;
1118 1080
1119 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) 1081 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
1120 pmd_flags2 = __PM_SOFT_DIRTY; 1082 flags |= PM_SOFT_DIRTY;
1121 else
1122 pmd_flags2 = 0;
1123 1083
1124 for (; addr != end; addr += PAGE_SIZE) { 1084 for (; addr != end; addr += PAGE_SIZE) {
1125 unsigned long offset; 1085 unsigned long offset;
@@ -1127,7 +1087,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1127 1087
1128 offset = (addr & ~PAGEMAP_WALK_MASK) >> 1088 offset = (addr & ~PAGEMAP_WALK_MASK) >>
1129 PAGE_SHIFT; 1089 PAGE_SHIFT;
1130 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); 1090 pme = thp_pmd_to_pagemap_entry(pm, *pmd, offset, flags);
1131 err = add_to_pagemap(addr, &pme, pm); 1091 err = add_to_pagemap(addr, &pme, pm);
1132 if (err) 1092 if (err)
1133 break; 1093 break;
@@ -1147,7 +1107,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1147 for (; addr < end; pte++, addr += PAGE_SIZE) { 1107 for (; addr < end; pte++, addr += PAGE_SIZE) {
1148 pagemap_entry_t pme; 1108 pagemap_entry_t pme;
1149 1109
1150 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); 1110 pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
1151 err = add_to_pagemap(addr, &pme, pm); 1111 err = add_to_pagemap(addr, &pme, pm);
1152 if (err) 1112 if (err)
1153 break; 1113 break;
@@ -1160,16 +1120,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1160} 1120}
1161 1121
1162#ifdef CONFIG_HUGETLB_PAGE 1122#ifdef CONFIG_HUGETLB_PAGE
1163static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1123static pagemap_entry_t huge_pte_to_pagemap_entry(struct pagemapread *pm,
1164 pte_t pte, int offset, int flags2) 1124 pte_t pte, int offset, u64 flags)
1165{ 1125{
1166 if (pte_present(pte)) 1126 u64 frame = 0;
1167 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | 1127
1168 PM_STATUS2(pm->v2, flags2) | 1128 if (pte_present(pte)) {
1169 PM_PRESENT); 1129 frame = pte_pfn(pte) + offset;
1170 else 1130 flags |= PM_PRESENT;
1171 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | 1131 }
1172 PM_STATUS2(pm->v2, flags2)); 1132
1133 return make_pme(frame, flags);
1173} 1134}
1174 1135
1175/* This function walks within one hugetlb entry in the single call */ 1136/* This function walks within one hugetlb entry in the single call */
@@ -1180,17 +1141,15 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1180 struct pagemapread *pm = walk->private; 1141 struct pagemapread *pm = walk->private;
1181 struct vm_area_struct *vma = walk->vma; 1142 struct vm_area_struct *vma = walk->vma;
1182 int err = 0; 1143 int err = 0;
1183 int flags2; 1144 u64 flags = 0;
1184 pagemap_entry_t pme; 1145 pagemap_entry_t pme;
1185 1146
1186 if (vma->vm_flags & VM_SOFTDIRTY) 1147 if (vma->vm_flags & VM_SOFTDIRTY)
1187 flags2 = __PM_SOFT_DIRTY; 1148 flags |= PM_SOFT_DIRTY;
1188 else
1189 flags2 = 0;
1190 1149
1191 for (; addr != end; addr += PAGE_SIZE) { 1150 for (; addr != end; addr += PAGE_SIZE) {
1192 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1151 int offset = (addr & ~hmask) >> PAGE_SHIFT;
1193 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); 1152 pme = huge_pte_to_pagemap_entry(pm, *pte, offset, flags);
1194 err = add_to_pagemap(addr, &pme, pm); 1153 err = add_to_pagemap(addr, &pme, pm);
1195 if (err) 1154 if (err)
1196 return err; 1155 return err;
@@ -1211,7 +1170,8 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1211 * Bits 0-54 page frame number (PFN) if present 1170 * Bits 0-54 page frame number (PFN) if present
1212 * Bits 0-4 swap type if swapped 1171 * Bits 0-4 swap type if swapped
1213 * Bits 5-54 swap offset if swapped 1172 * Bits 5-54 swap offset if swapped
1214 * Bits 55-60 page shift (page size = 1<<page shift) 1173 * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
1174 * Bits 56-60 zero
1215 * Bit 61 page is file-page or shared-anon 1175 * Bit 61 page is file-page or shared-anon
1216 * Bit 62 page swapped 1176 * Bit 62 page swapped
1217 * Bit 63 page present 1177 * Bit 63 page present
@@ -1250,7 +1210,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1250 if (!count) 1210 if (!count)
1251 goto out_mm; 1211 goto out_mm;
1252 1212
1253 pm.v2 = soft_dirty_cleared;
1254 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1213 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1255 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); 1214 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1256 ret = -ENOMEM; 1215 ret = -ENOMEM;
@@ -1323,9 +1282,6 @@ static int pagemap_open(struct inode *inode, struct file *file)
1323 /* do not disclose physical addresses: attack vector */ 1282 /* do not disclose physical addresses: attack vector */
1324 if (!capable(CAP_SYS_ADMIN)) 1283 if (!capable(CAP_SYS_ADMIN))
1325 return -EPERM; 1284 return -EPERM;
1326 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1327 "to stop being page-shift some time soon. See the "
1328 "linux/Documentation/vm/pagemap.txt for details.\n");
1329 1285
1330 mm = proc_mem_open(inode, PTRACE_MODE_READ); 1286 mm = proc_mem_open(inode, PTRACE_MODE_READ);
1331 if (IS_ERR(mm)) 1287 if (IS_ERR(mm))
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 8bdf16b8ba60..603ec916716b 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -57,23 +57,14 @@
57 * pagemap kernel ABI bits 57 * pagemap kernel ABI bits
58 */ 58 */
59 59
60#define PM_ENTRY_BYTES sizeof(uint64_t) 60#define PM_ENTRY_BYTES 8
61#define PM_STATUS_BITS 3 61#define PM_PFRAME_BITS 55
62#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) 62#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1)
63#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) 63#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
64#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) 64#define PM_SOFT_DIRTY (1ULL << 55)
65#define PM_PSHIFT_BITS 6 65#define PM_FILE (1ULL << 61)
66#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) 66#define PM_SWAP (1ULL << 62)
67#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) 67#define PM_PRESENT (1ULL << 63)
68#define __PM_PSHIFT(x) (((uint64_t) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
69#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
70#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
71
72#define __PM_SOFT_DIRTY (1LL)
73#define PM_PRESENT PM_STATUS(4LL)
74#define PM_SWAP PM_STATUS(2LL)
75#define PM_SOFT_DIRTY __PM_PSHIFT(__PM_SOFT_DIRTY)
76
77 68
78/* 69/*
79 * kernel page flags 70 * kernel page flags