Diffstat (limited to 'fs/proc/task_mmu.c')

 fs/proc/task_mmu.c | 145 ++++++++++++++++++++++++++++++++--------
 1 file changed, 116 insertions(+), 29 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..dbf61f6174f0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -688,10 +689,58 @@ const struct file_operations proc_tid_smaps_operations = {
 	.release	= seq_release_private,
 };
 
+/*
+ * We do not want to have constant page-shift bits sitting in
+ * pagemap entries and are about to reuse them some time soon.
+ *
+ * Here's the "migration strategy":
+ * 1. when the system boots these bits remain what they are,
+ *    but a warning about the future change is printed in the log;
+ * 2. once anyone clears soft-dirty bits via the clear_refs file,
+ *    this flag is set to denote that the user is aware of the
+ *    new API and those page-shift bits change their meaning.
+ *    The respective warning is printed in dmesg;
+ * 3. in a couple of releases we will remove all mentions of
+ *    page-shift in pagemap entries.
+ */
+
+static bool soft_dirty_cleared __read_mostly;
+
+enum clear_refs_types {
+	CLEAR_REFS_ALL = 1,
+	CLEAR_REFS_ANON,
+	CLEAR_REFS_MAPPED,
+	CLEAR_REFS_SOFT_DIRTY,
+	CLEAR_REFS_LAST,
+};
+
+struct clear_refs_private {
+	struct vm_area_struct *vma;
+	enum clear_refs_types type;
+};
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+	/*
+	 * The soft-dirty tracker uses #PF-s to catch writes
+	 * to pages, so write-protect the pte as well. See
+	 * Documentation/vm/soft-dirty.txt for a full description
+	 * of how soft-dirty works.
+	 */
+	pte_t ptent = *pte;
+	ptent = pte_wrprotect(ptent);
+	ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+	set_pte_at(vma->vm_mm, addr, pte, ptent);
+#endif
+}
+
 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 {
-	struct vm_area_struct *vma = walk->private;
+	struct clear_refs_private *cp = walk->private;
+	struct vm_area_struct *vma = cp->vma;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
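
The mechanism above is driven entirely from userspace writes to /proc/PID/clear_refs. A minimal sketch of the caller side (not part of this patch; the helper name is illustrative and error handling is trimmed), where writing the string "4" selects CLEAR_REFS_SOFT_DIRTY from the enum above:

#include <fcntl.h>
#include <unistd.h>

/* Clear the soft-dirty bits of every mapped page in the current
 * process; "4" matches CLEAR_REFS_SOFT_DIRTY in the enum above. */
static int clear_soft_dirty_self(void)
{
	int fd = open("/proc/self/clear_refs", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "4", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}

After this write returns, every present pte in the mm has been write-protected, so the next store to any such page takes a minor fault, which is how the tracker re-marks the page soft-dirty.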
@@ -706,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!pte_present(ptent))
 			continue;
 
+		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+			clear_soft_dirty(vma, addr, pte);
+			continue;
+		}
+
 		page = vm_normal_page(vma, addr, ptent);
 		if (!page)
 			continue;
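
Note how the callback now pulls a structure, not a bare vma pointer, out of walk->private: that is the standard way to thread more than one piece of state into a page-table walker. The pattern in miniature (the demo_* names are hypothetical, not from this patch):

struct demo_walk_state {
	struct vm_area_struct *vma;	/* current vma, updated per iteration */
	int flags;			/* whatever else the callback needs */
};

static int demo_pmd_entry(pmd_t *pmd, unsigned long addr,
			  unsigned long end, struct mm_walk *walk)
{
	struct demo_walk_state *st = walk->private;

	/* both st->vma and st->flags are visible to every callback */
	return 0;
}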
@@ -719,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 
-#define CLEAR_REFS_ALL 1
-#define CLEAR_REFS_ANON 2
-#define CLEAR_REFS_MAPPED 3
-
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
@@ -730,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 	char buffer[PROC_NUMBUF];
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	int type;
+	enum clear_refs_types type;
+	int itype;
 	int rv;
 
 	memset(buffer, 0, sizeof(buffer));
@@ -738,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		count = sizeof(buffer) - 1;
 	if (copy_from_user(buffer, buf, count))
 		return -EFAULT;
-	rv = kstrtoint(strstrip(buffer), 10, &type);
+	rv = kstrtoint(strstrip(buffer), 10, &itype);
 	if (rv < 0)
 		return rv;
-	if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
+	type = (enum clear_refs_types)itype;
+	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
 		return -EINVAL;
+
+	if (type == CLEAR_REFS_SOFT_DIRTY) {
+		soft_dirty_cleared = true;
+		pr_warn_once("The pagemap bits 55-60 have changed their meaning!"
+			     " See Documentation/vm/pagemap.txt for details.\n");
+	}
+
 	task = get_proc_task(file_inode(file));
 	if (!task)
 		return -ESRCH;
 	mm = get_task_mm(task);
 	if (mm) {
+		struct clear_refs_private cp = {
+			.type = type,
+		};
 		struct mm_walk clear_refs_walk = {
 			.pmd_entry = clear_refs_pte_range,
 			.mm = mm,
+			.private = &cp,
 		};
 		down_read(&mm->mmap_sem);
+		if (type == CLEAR_REFS_SOFT_DIRTY)
+			mmu_notifier_invalidate_range_start(mm, 0, -1);
 		for (vma = mm->mmap; vma; vma = vma->vm_next) {
-			clear_refs_walk.private = vma;
+			cp.vma = vma;
 			if (is_vm_hugetlb_page(vma))
 				continue;
 			/*
@@ -773,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 			walk_page_range(vma->vm_start, vma->vm_end,
 					&clear_refs_walk);
 		}
+		if (type == CLEAR_REFS_SOFT_DIRTY)
+			mmu_notifier_invalidate_range_end(mm, 0, -1);
 		flush_tlb_mm(mm);
 		up_read(&mm->mmap_sem);
 		mmput(mm);
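
Write-protecting ptes behind the back of a secondary MMU (KVM, IOMMU-backed drivers) would leave stale translations, hence the invalidate_range_start/end pair bracketing the walk over the whole address space (0 to -1). A sketch of what a consumer would observe (the demo_* names are illustrative; the callback signature matches the mmu_notifier API of this kernel):

static void demo_invalidate_range_start(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	/* here start == 0 and end == -1UL: drop shadow state for the
	 * entire mm before the soft-dirty clearing pass begins */
}

static const struct mmu_notifier_ops demo_notifier_ops = {
	.invalidate_range_start	= demo_invalidate_range_start,
};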
@@ -794,6 +861,7 @@ typedef struct {
 struct pagemapread {
 	int pos, len;
 	pagemap_entry_t *buffer;
+	bool v2;
 };
 
 #define PAGEMAP_WALK_SIZE	(PMD_SIZE)
@@ -807,14 +875,17 @@ struct pagemapread {
 #define PM_PSHIFT_BITS      6
 #define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
 #define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define __PM_PSHIFT(x)      (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
 #define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
 #define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
+/* in the "new" pagemap the pshift bits are reused as extra status bits */
+#define PM_STATUS2(v2, x)   (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
 
+#define __PM_SOFT_DIRTY     (1LL)
 #define PM_PRESENT          PM_STATUS(4LL)
 #define PM_SWAP             PM_STATUS(2LL)
 #define PM_FILE             PM_STATUS(1LL)
-#define PM_NOT_PRESENT      PM_PSHIFT(PAGE_SHIFT)
+#define PM_NOT_PRESENT(v2)  PM_STATUS2(v2, 0)
 #define PM_END_OF_BUFFER    1
 
 static inline pagemap_entry_t make_pme(u64 val)
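
Putting the macros together: the three status bits occupy 61-63 and the six pshift bits occupy 55-60 (PM_STATUS_OFFSET and PM_PSHIFT_OFFSET are defined just above this hunk as 61 and 55), so under the v2 layout bit 55 becomes the soft-dirty flag while bits 0-54 still hold the PFN. A userspace decoder would look like this (a sketch; the PME_* names are illustrative, not kernel macros):

#include <stdint.h>

#define PME_PRESENT	(1ULL << 63)	/* PM_STATUS(4LL) */
#define PME_SWAP	(1ULL << 62)	/* PM_STATUS(2LL) */
#define PME_FILE	(1ULL << 61)	/* PM_STATUS(1LL) */
#define PME_SOFT_DIRTY	(1ULL << 55)	/* __PM_SOFT_DIRTY at PM_PSHIFT_OFFSET */
#define PME_PFN_MASK	((1ULL << 55) - 1)

/* valid only for a "v2" entry, i.e. after soft_dirty_cleared is set */
static inline int pme_soft_dirty(uint64_t pme)
{
	return (pme & PME_PRESENT) && (pme & PME_SOFT_DIRTY);
}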
@@ -837,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 	struct pagemapread *pm = walk->private;
 	unsigned long addr;
 	int err = 0;
-	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
 	for (addr = start; addr < end; addr += PAGE_SIZE) {
 		err = add_to_pagemap(addr, &pme, pm);
@@ -847,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 	return err;
 }
 
-static void pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
 	u64 frame, flags;
 	struct page *page = NULL;
+	int flags2 = 0;
 
 	if (pte_present(pte)) {
 		frame = pte_pfn(pte);
@@ -866,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme,
 		if (is_migration_entry(entry))
 			page = migration_entry_to_page(entry);
 	} else {
-		*pme = make_pme(PM_NOT_PRESENT);
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
 		return;
 	}
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
+	if (pte_soft_dirty(pte))
+		flags2 |= __PM_SOFT_DIRTY;
 
-	*pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags);
+	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
-		pmd_t pmd, int offset)
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+		pmd_t pmd, int offset, int pmd_flags2)
 {
 	/*
 	 * Currently pmd for thp is always present because thp can not be
@@ -887,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
 	 */
 	if (pmd_present(pmd))
 		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
-				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT);
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
 }
 #else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
-		pmd_t pmd, int offset)
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+		pmd_t pmd, int offset, int pmd_flags2)
 {
 }
 #endif
@@ -905,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	struct pagemapread *pm = walk->private;
 	pte_t *pte;
 	int err = 0;
-	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
 	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+		int pmd_flags2;
+
+		pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
 		for (; addr != end; addr += PAGE_SIZE) {
 			unsigned long offset;
 
 			offset = (addr & ~PAGEMAP_WALK_MASK) >>
 					PAGE_SHIFT;
-			thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+			thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
 			err = add_to_pagemap(addr, &pme, pm);
 			if (err)
 				break;
@@ -932,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	 * and need a new, higher one */
 	if (vma && (addr >= vma->vm_end)) {
 		vma = find_vma(walk->mm, addr);
-		pme = make_pme(PM_NOT_PRESENT);
+		pme = make_pme(PM_NOT_PRESENT(pm->v2));
 	}
 
 	/* check that 'vma' actually covers this address,
@@ -940,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	if (vma && (vma->vm_start <= addr) &&
 	    !is_vm_hugetlb_page(vma)) {
 		pte = pte_offset_map(pmd, addr);
-		pte_to_pagemap_entry(&pme, vma, addr, *pte);
+		pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
 		/* unmap before userspace copy */
 		pte_unmap(pte);
 	}
@@ -955,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 		pte_t pte, int offset)
 {
 	if (pte_present(pte))
 		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
-				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+				| PM_STATUS2(pm->v2, 0) | PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT);
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
 }
 
 /* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 
 	for (; addr != end; addr += PAGE_SIZE) {
 		int offset = (addr & ~hmask) >> PAGE_SHIFT;
-		huge_pte_to_pagemap_entry(&pme, *pte, offset);
+		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
 		err = add_to_pagemap(addr, &pme, pm);
 		if (err)
 			return err;
@@ -1038,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!count)
 		goto out_task;
 
+	pm.v2 = soft_dirty_cleared;
 	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
 	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
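
Because pm.v2 latches the global soft_dirty_cleared flag on every read, all pagemap readers system-wide see the v2 layout once any process has used CLEAR_REFS_SOFT_DIRTY. On the userspace side, fetching the entry that covers a given virtual address is a seek-and-read, since pagemap is an array of u64s indexed by virtual page number. A sketch (the helper name is illustrative):

#include <stdint.h>
#include <unistd.h>

static int read_pagemap_entry(int fd, const void *addr, long page_size,
			      uint64_t *out)
{
	off_t off = (uintptr_t)addr / page_size * sizeof(uint64_t);

	if (pread(fd, out, sizeof(*out), off) != (ssize_t)sizeof(*out))
		return -1;
	return 0;
}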
@@ -1110,9 +1188,18 @@ out:
 	return ret;
 }
 
+static int pagemap_open(struct inode *inode, struct file *file)
+{
+	pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about"
+		     " to stop being page-shift some time soon. See"
+		     " Documentation/vm/pagemap.txt for details.\n");
+	return 0;
+}
+
 const struct file_operations proc_pagemap_operations = {
 	.llseek		= mem_lseek, /* borrow this */
 	.read		= pagemap_read,
+	.open		= pagemap_open,
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
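
Tying the whole interface together, an end-to-end sketch reusing the hypothetical helpers from the earlier examples: clear the bits, dirty some memory, then confirm that bit 55 reports the write:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static char buf[1 << 16];		/* spans several pages */
	long psz = sysconf(_SC_PAGESIZE);
	uint64_t pme = 0;
	int fd;

	clear_soft_dirty_self();		/* write "4" to clear_refs */
	memset(buf, 1, sizeof(buf));		/* take write faults */

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0 || read_pagemap_entry(fd, buf, psz, &pme))
		return 1;
	printf("page soft-dirty: %s\n", pme_soft_dirty(pme) ? "yes" : "no");
	close(fd);
	return 0;
}

Note that the kernel only reports soft-dirty in bits 55-60 after the first CLEAR_REFS_SOFT_DIRTY write has switched the layout to v2, which the sketch's initial clear pass guarantees.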
1118 1205