Diffstat (limited to 'fs/proc/task_mmu.c')
-rw-r--r--  fs/proc/task_mmu.c | 168
1 file changed, 133 insertions(+), 35 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..107d026f5d6e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -688,10 +689,66 @@ const struct file_operations proc_tid_smaps_operations = {
         .release        = seq_release_private,
 };
 
+/*
+ * We do not want to have constant page-shift bits sitting in
+ * pagemap entries and are about to reuse them some time soon.
+ *
+ * Here's the "migration strategy":
+ * 1. when the system boots, these bits keep their current meaning,
+ *    but a warning about the future change is printed to the log;
+ * 2. once anyone clears soft-dirty bits via the clear_refs file,
+ *    this flag is set to denote that the user is aware of the
+ *    new API and the page-shift bits change their meaning.
+ *    The respective warning is printed to dmesg;
+ * 3. in a couple of releases we will remove all mentions of
+ *    page-shift from pagemap entries.
+ */
+
+static bool soft_dirty_cleared __read_mostly;
+
+enum clear_refs_types {
+        CLEAR_REFS_ALL = 1,
+        CLEAR_REFS_ANON,
+        CLEAR_REFS_MAPPED,
+        CLEAR_REFS_SOFT_DIRTY,
+        CLEAR_REFS_LAST,
+};
+
+struct clear_refs_private {
+        struct vm_area_struct *vma;
+        enum clear_refs_types type;
+};
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+                unsigned long addr, pte_t *pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+        /*
+         * The soft-dirty tracker uses #PF-s to catch writes
+         * to pages, so write-protect the pte as well. See
+         * Documentation/vm/soft-dirty.txt for a full description
+         * of how soft-dirty works.
+         */
+        pte_t ptent = *pte;
+
+        if (pte_present(ptent)) {
+                ptent = pte_wrprotect(ptent);
+                ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+        } else if (is_swap_pte(ptent)) {
+                ptent = pte_swp_clear_soft_dirty(ptent);
+        } else if (pte_file(ptent)) {
+                ptent = pte_file_clear_soft_dirty(ptent);
+        }
+
+        set_pte_at(vma->vm_mm, addr, pte, ptent);
+#endif
+}
+
 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
                                 unsigned long end, struct mm_walk *walk)
 {
-        struct vm_area_struct *vma = walk->private;
+        struct clear_refs_private *cp = walk->private;
+        struct vm_area_struct *vma = cp->vma;
         pte_t *pte, ptent;
         spinlock_t *ptl;
         struct page *page;
@@ -703,6 +760,12 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
         for (; addr != end; pte++, addr += PAGE_SIZE) {
                 ptent = *pte;
+
+                if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+                        clear_soft_dirty(vma, addr, pte);
+                        continue;
+                }
+
                 if (!pte_present(ptent))
                         continue;
 
@@ -719,10 +782,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
         return 0;
 }
 
-#define CLEAR_REFS_ALL 1
-#define CLEAR_REFS_ANON 2
-#define CLEAR_REFS_MAPPED 3
-
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                                 size_t count, loff_t *ppos)
 {
@@ -730,7 +789,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
         char buffer[PROC_NUMBUF];
         struct mm_struct *mm;
         struct vm_area_struct *vma;
-        int type;
+        enum clear_refs_types type;
+        int itype;
         int rv;
 
         memset(buffer, 0, sizeof(buffer));
@@ -738,23 +798,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                 count = sizeof(buffer) - 1;
         if (copy_from_user(buffer, buf, count))
                 return -EFAULT;
-        rv = kstrtoint(strstrip(buffer), 10, &type);
+        rv = kstrtoint(strstrip(buffer), 10, &itype);
         if (rv < 0)
                 return rv;
-        if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
+        type = (enum clear_refs_types)itype;
+        if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
                 return -EINVAL;
+
+        if (type == CLEAR_REFS_SOFT_DIRTY) {
+                soft_dirty_cleared = true;
+                pr_warn_once("The pagemap bits 55-60 have changed their meaning! "
+                             "See Documentation/vm/pagemap.txt for details.\n");
+        }
+
         task = get_proc_task(file_inode(file));
         if (!task)
                 return -ESRCH;
         mm = get_task_mm(task);
         if (mm) {
+                struct clear_refs_private cp = {
+                        .type = type,
+                };
                 struct mm_walk clear_refs_walk = {
                         .pmd_entry = clear_refs_pte_range,
                         .mm = mm,
+                        .private = &cp,
                 };
                 down_read(&mm->mmap_sem);
+                if (type == CLEAR_REFS_SOFT_DIRTY)
+                        mmu_notifier_invalidate_range_start(mm, 0, -1);
                 for (vma = mm->mmap; vma; vma = vma->vm_next) {
-                        clear_refs_walk.private = vma;
+                        cp.vma = vma;
                         if (is_vm_hugetlb_page(vma))
                                 continue;
                         /*
@@ -773,6 +847,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                         walk_page_range(vma->vm_start, vma->vm_end,
                                         &clear_refs_walk);
                 }
+                if (type == CLEAR_REFS_SOFT_DIRTY)
+                        mmu_notifier_invalidate_range_end(mm, 0, -1);
                 flush_tlb_mm(mm);
                 up_read(&mm->mmap_sem);
                 mmput(mm);
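
For orientation, here is a minimal userspace sketch of the writer side wired up above: writing "4" (CLEAR_REFS_SOFT_DIRTY) into a task's clear_refs file arms soft-dirty tracking by write-protecting its PTEs and clearing their soft-dirty bits. The helper name and error handling are illustrative, not part of this patch.

/* Sketch only: arm soft-dirty tracking for a task. Assumes a kernel
 * built with CONFIG_MEM_SOFT_DIRTY; clear_soft_dirty_bits() is a
 * hypothetical helper, not an API from this patch. */
#include <stdio.h>
#include <sys/types.h>

static int clear_soft_dirty_bits(pid_t pid)
{
        char path[64];
        FILE *f;
        int ret;

        snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
        f = fopen(path, "w");
        if (!f)
                return -1;
        /* "4" == CLEAR_REFS_SOFT_DIRTY: every PTE in the task is
         * write-protected and its soft-dirty bit cleared, so the
         * next write faults and re-marks the page. */
        ret = (fputs("4", f) == EOF) ? -1 : 0;
        if (fclose(f) != 0)
                ret = -1;
        return ret;
}
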
@@ -792,14 +868,15 @@ typedef struct {
 } pagemap_entry_t;
 
 struct pagemapread {
-        int pos, len;
+        int pos, len;           /* units: PM_ENTRY_BYTES, not bytes */
         pagemap_entry_t *buffer;
+        bool v2;
 };
 
 #define PAGEMAP_WALK_SIZE       (PMD_SIZE)
 #define PAGEMAP_WALK_MASK       (PMD_MASK)
 
-#define PM_ENTRY_BYTES      sizeof(u64)
+#define PM_ENTRY_BYTES      sizeof(pagemap_entry_t)
 #define PM_STATUS_BITS      3
 #define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
 #define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
@@ -807,14 +884,17 @@ struct pagemapread {
 #define PM_PSHIFT_BITS      6
 #define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
 #define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define __PM_PSHIFT(x)      (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
 #define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
 #define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
+/* in the "new" pagemap layout, the pshift bits carry extra status bits */
+#define PM_STATUS2(v2, x)   (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
 
+#define __PM_SOFT_DIRTY     (1LL)
 #define PM_PRESENT          PM_STATUS(4LL)
 #define PM_SWAP             PM_STATUS(2LL)
 #define PM_FILE             PM_STATUS(1LL)
-#define PM_NOT_PRESENT      PM_PSHIFT(PAGE_SHIFT)
+#define PM_NOT_PRESENT(v2)  PM_STATUS2(v2, 0)
 #define PM_END_OF_BUFFER    1
 
 static inline pagemap_entry_t make_pme(u64 val)
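
To make the bit layout above concrete: PM_STATUS_OFFSET works out to 61 and PM_PSHIFT_OFFSET to 55, so under the v2 layout a pagemap entry carries present/swap/file status in bits 61-63, the soft-dirty flag in bit 55 (the old page-shift field), and the PFN (or swap type/offset) in bits 0-54. A hypothetical userspace decoder, with PME_* names invented for this sketch:

/* Sketch: decode one 64-bit /proc/PID/pagemap entry, assuming the
 * v2 layout is active. The PME_* macros mirror the kernel-side
 * PM_* definitions above but are not exported anywhere. */
#include <stdint.h>
#include <stdio.h>

#define PME_PRESENT     (1ULL << 63)    /* PM_STATUS(4LL) */
#define PME_SWAP        (1ULL << 62)    /* PM_STATUS(2LL) */
#define PME_FILE        (1ULL << 61)    /* PM_STATUS(1LL) */
#define PME_SOFT_DIRTY  (1ULL << 55)    /* __PM_SOFT_DIRTY at PM_PSHIFT_OFFSET */
#define PME_PFRAME(e)   ((e) & ((1ULL << 55) - 1))      /* PM_PFRAME_MASK */

static void decode_pme(uint64_t e)
{
        printf("present=%d swap=%d file=%d soft-dirty=%d frame=0x%llx\n",
               !!(e & PME_PRESENT), !!(e & PME_SWAP), !!(e & PME_FILE),
               !!(e & PME_SOFT_DIRTY), (unsigned long long)PME_PFRAME(e));
}
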
@@ -837,7 +917,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
         struct pagemapread *pm = walk->private;
         unsigned long addr;
         int err = 0;
-        pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+        pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
         for (addr = start; addr < end; addr += PAGE_SIZE) {
                 err = add_to_pagemap(addr, &pme, pm);
@@ -847,38 +927,43 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
         return err;
 }
 
-static void pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
                 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
         u64 frame, flags;
         struct page *page = NULL;
+        int flags2 = 0;
 
         if (pte_present(pte)) {
                 frame = pte_pfn(pte);
                 flags = PM_PRESENT;
                 page = vm_normal_page(vma, addr, pte);
         } else if (is_swap_pte(pte)) {
-                swp_entry_t entry = pte_to_swp_entry(pte);
-
+                swp_entry_t entry;
+                if (pte_swp_soft_dirty(pte))
+                        flags2 |= __PM_SOFT_DIRTY;
+                entry = pte_to_swp_entry(pte);
                 frame = swp_type(entry) |
                         (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
                 flags = PM_SWAP;
                 if (is_migration_entry(entry))
                         page = migration_entry_to_page(entry);
         } else {
-                *pme = make_pme(PM_NOT_PRESENT);
+                *pme = make_pme(PM_NOT_PRESENT(pm->v2));
                 return;
         }
 
         if (page && !PageAnon(page))
                 flags |= PM_FILE;
+        if (pte_soft_dirty(pte))
+                flags2 |= __PM_SOFT_DIRTY;
 
-        *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags);
+        *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
-                pmd_t pmd, int offset)
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+                pmd_t pmd, int offset, int pmd_flags2)
 {
         /*
          * Currently pmd for thp is always present because thp can not be
@@ -887,13 +972,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
          */
         if (pmd_present(pmd))
                 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
-                                | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+                                | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
         else
-                *pme = make_pme(PM_NOT_PRESENT);
+                *pme = make_pme(PM_NOT_PRESENT(pm->v2));
 }
 #else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
-                pmd_t pmd, int offset)
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+                pmd_t pmd, int offset, int pmd_flags2)
 {
 }
 #endif
@@ -905,17 +990,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         struct pagemapread *pm = walk->private;
         pte_t *pte;
         int err = 0;
-        pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+        pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
         /* find the first VMA at or above 'addr' */
         vma = find_vma(walk->mm, addr);
         if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+                int pmd_flags2;
+
+                pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
                 for (; addr != end; addr += PAGE_SIZE) {
                         unsigned long offset;
 
                         offset = (addr & ~PAGEMAP_WALK_MASK) >>
                                         PAGE_SHIFT;
-                        thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+                        thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
                         err = add_to_pagemap(addr, &pme, pm);
                         if (err)
                                 break;
@@ -932,7 +1020,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
          * and need a new, higher one */
         if (vma && (addr >= vma->vm_end)) {
                 vma = find_vma(walk->mm, addr);
-                pme = make_pme(PM_NOT_PRESENT);
+                pme = make_pme(PM_NOT_PRESENT(pm->v2));
         }
 
         /* check that 'vma' actually covers this address,
@@ -940,7 +1028,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         if (vma && (vma->vm_start <= addr) &&
             !is_vm_hugetlb_page(vma)) {
                 pte = pte_offset_map(pmd, addr);
-                pte_to_pagemap_entry(&pme, vma, addr, *pte);
+                pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
                 /* unmap before userspace copy */
                 pte_unmap(pte);
         }
@@ -955,14 +1043,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
                 pte_t pte, int offset)
 {
         if (pte_present(pte))
                 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
-                                | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+                                | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
         else
-                *pme = make_pme(PM_NOT_PRESENT);
+                *pme = make_pme(PM_NOT_PRESENT(pm->v2));
 }
 
 /* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1064,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 
         for (; addr != end; addr += PAGE_SIZE) {
                 int offset = (addr & ~hmask) >> PAGE_SHIFT;
-                huge_pte_to_pagemap_entry(&pme, *pte, offset);
+                huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
                 err = add_to_pagemap(addr, &pme, pm);
                 if (err)
                         return err;
@@ -1038,8 +1126,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
         if (!count)
                 goto out_task;
 
-        pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
-        pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
+        pm.v2 = soft_dirty_cleared;
+        pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+        pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
         ret = -ENOMEM;
         if (!pm.buffer)
                 goto out_task;
@@ -1110,9 +1199,18 @@ out:
         return ret;
 }
 
+static int pagemap_open(struct inode *inode, struct file *file)
+{
+        pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries will stop "
+                        "being page-shift some time soon. See "
+                        "Documentation/vm/pagemap.txt for details.\n");
+        return 0;
+}
+
 const struct file_operations proc_pagemap_operations = {
         .llseek         = mem_lseek, /* borrow this */
         .read           = pagemap_read,
+        .open           = pagemap_open,
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
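Putting the two sides together, a hedged end-to-end sketch of the intended workflow: clear the bits, dirty a page, then read its entry back. The offset arithmetic (one 8-byte entry per virtual page) follows PM_ENTRY_BYTES above; everything else, including the use of /proc/self and the soft-dirty bit landing in bit 55, assumes CONFIG_MEM_SOFT_DIRTY and the v2 layout.

/* Sketch: observe the soft-dirty cycle on the current task.
 * Assumes CONFIG_MEM_SOFT_DIRTY and the v2 pagemap layout
 * (active once clear_refs has seen a "4" write). */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        long psize = sysconf(_SC_PAGESIZE);
        char *page = malloc(psize);
        uint64_t entry;
        int fd;

        if (!page)
                return 1;
        fd = open("/proc/self/clear_refs", O_WRONLY);
        if (fd < 0 || write(fd, "4", 1) != 1)   /* CLEAR_REFS_SOFT_DIRTY */
                return 1;
        close(fd);

        page[0] = 1;    /* write => #PF => soft-dirty set again */

        fd = open("/proc/self/pagemap", O_RDONLY);
        if (fd < 0)
                return 1;
        /* one 8-byte entry per page, indexed by virtual page number */
        if (pread(fd, &entry, sizeof(entry),
                  (off_t)((uintptr_t)page / psize) * sizeof(entry)) != sizeof(entry))
                return 1;
        close(fd);

        printf("soft-dirty: %d\n", !!(entry & (1ULL << 55)));
        return 0;
}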