author	Martin Schwidefsky <schwidefsky@de.ibm.com>	2012-11-07 07:17:37 -0500
committer	Martin Schwidefsky <schwidefsky@de.ibm.com>	2013-02-14 09:55:23 -0500
commit	abf09bed3cceadd809f0356065c2ada6cee90d4a (patch)
tree	b81cac34a4111f498cdef104a2b9c4c444faf0bd
parent	486c0a0bc80d370471b21662bf03f04fbb37cdc6 (diff)
s390/mm: implement software dirty bits
The s390 architecture is unique in respect to dirty page detection:
it uses the change bit in the per-page storage key to track page
modifications. All other architectures track dirty bits by means of
page table entries. This property of s390 has caused numerous problems
in the past, e.g. see git commit ef5d437f71afdf4a "mm: fix XFS oops
due to dirty pages without buffers on s390".

To avoid future issues in regard to per-page dirty bits, convert s390
to a fault-based software dirty bit detection mechanism. All user page
table entries which are marked as clean will be hardware read-only,
even if the pte is supposed to be writable. A write by the user
process will trigger a protection fault which will cause the user pte
to be marked as dirty and the hardware read-only bit to be removed.

With this change the dirty bit in the storage key is irrelevant for
Linux as a host, but the storage key is still required for KVM guests.
The effect is that page_test_and_clear_dirty and the related code can
be removed. The referenced bit in the storage key is still used by the
page_test_and_clear_young primitive to provide page age information.

For page cache pages of mappings with mapping_cap_account_dirty there
will not be any change in behavior, as the dirty bit tracking already
uses read-only ptes to control the amount of dirty pages. Only for
swap cache pages and pages of mappings without
mapping_cap_account_dirty can there be additional protection faults.

To avoid an excessive number of additional faults, the mk_pte
primitive checks for PageDirty if the pgprot value allows for writes
and pre-dirties the pte. That avoids all additional faults for tmpfs
and shmem pages until these pages are added to the swap cache.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
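As an illustration (not part of the patch): the new pte helpers in the
diff below all maintain one invariant — a pte becomes hardware writable
only once it carries both the software write bit (_PAGE_SWW) and the
software dirty bit (_PAGE_SWC). The helper name pte_hw_writable is
hypothetical, used here only to state that invariant.

	/*
	 * Hypothetical helper, for illustration only: after this patch
	 * the hardware read-only bit _PAGE_RO is cleared only when the
	 * pte is both writable (_PAGE_SWW) and dirty (_PAGE_SWC). The
	 * first write to a clean, writable pte therefore takes a
	 * protection fault, and the fault handler dirties the pte via
	 * pte_mkdirty(), which in turn drops _PAGE_RO.
	 */
	static inline int pte_hw_writable(pte_t pte)
	{
		return (pte_val(pte) & (_PAGE_SWW | _PAGE_SWC)) ==
		       (_PAGE_SWW | _PAGE_SWC);
	}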
-rw-r--r--	arch/s390/include/asm/page.h	22
-rw-r--r--	arch/s390/include/asm/pgtable.h	131
-rw-r--r--	arch/s390/include/asm/sclp.h	1
-rw-r--r--	arch/s390/include/asm/setup.h	16
-rw-r--r--	arch/s390/kvm/kvm-s390.c	2
-rw-r--r--	arch/s390/lib/uaccess_pt.c	2
-rw-r--r--	arch/s390/mm/pageattr.c	2
-rw-r--r--	arch/s390/mm/vmem.c	24
-rw-r--r--	drivers/s390/char/sclp_cmd.c	10
-rw-r--r--	include/asm-generic/pgtable.h	10
-rw-r--r--	include/linux/page-flags.h	8
-rw-r--r--	mm/rmap.c	24
12 files changed, 112 insertions(+), 140 deletions(-)
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index a86ad4084073..75ce9b065f9f 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -155,28 +155,6 @@ static inline int page_reset_referenced(unsigned long addr)
 #define _PAGE_ACC_BITS	0xf0		/* HW access control bits */
 
 /*
- * Test and clear dirty bit in storage key.
- * We can't clear the changed bit atomically. This is a potential
- * race against modification of the referenced bit. This function
- * should therefore only be called if it is not mapped in any
- * address space.
- *
- * Note that the bit gets set whenever page content is changed. That means
- * also when the page is modified by DMA or from inside the kernel.
- */
-#define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_DIRTY
-static inline int page_test_and_clear_dirty(unsigned long pfn, int mapped)
-{
-	unsigned char skey;
-
-	skey = page_get_storage_key(pfn << PAGE_SHIFT);
-	if (!(skey & _PAGE_CHANGED))
-		return 0;
-	page_set_storage_key(pfn << PAGE_SHIFT, skey & ~_PAGE_CHANGED, mapped);
-	return 1;
-}
-
-/*
  * Test and clear referenced bit in storage key.
  */
 #define __HAVE_ARCH_PAGE_TEST_AND_CLEAR_YOUNG
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index a009d4dd70cb..97de1200c849 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -29,6 +29,7 @@
 #ifndef __ASSEMBLY__
 #include <linux/sched.h>
 #include <linux/mm_types.h>
+#include <linux/page-flags.h>
 #include <asm/bug.h>
 #include <asm/page.h>
 
@@ -221,13 +222,15 @@ extern unsigned long MODULES_END;
 /* Software bits in the page table entry */
 #define _PAGE_SWT	0x001		/* SW pte type bit t */
 #define _PAGE_SWX	0x002		/* SW pte type bit x */
-#define _PAGE_SWC	0x004		/* SW pte changed bit (for KVM) */
-#define _PAGE_SWR	0x008		/* SW pte referenced bit (for KVM) */
-#define _PAGE_SPECIAL	0x010		/* SW associated with special page */
+#define _PAGE_SWC	0x004		/* SW pte changed bit */
+#define _PAGE_SWR	0x008		/* SW pte referenced bit */
+#define _PAGE_SWW	0x010		/* SW pte write bit */
+#define _PAGE_SPECIAL	0x020		/* SW associated with special page */
 #define __HAVE_ARCH_PTE_SPECIAL
 
 /* Set of bits not changed in pte_modify */
-#define _PAGE_CHG_MASK		(PAGE_MASK | _PAGE_SPECIAL | _PAGE_SWC | _PAGE_SWR)
+#define _PAGE_CHG_MASK		(PAGE_MASK | _PAGE_SPECIAL | _PAGE_CO | \
+				 _PAGE_SWC | _PAGE_SWR)
 
 /* Six different types of pages. */
 #define _PAGE_TYPE_EMPTY	0x400
@@ -321,6 +324,7 @@ extern unsigned long MODULES_END;
 
 /* Bits in the region table entry */
 #define _REGION_ENTRY_ORIGIN	~0xfffUL/* region/segment table origin	    */
+#define _REGION_ENTRY_RO	0x200	/* region protection bit	    */
 #define _REGION_ENTRY_INV	0x20	/* invalid region table entry	    */
 #define _REGION_ENTRY_TYPE_MASK	0x0c	/* region/segment table type mask   */
 #define _REGION_ENTRY_TYPE_R1	0x0c	/* region first table type	    */
@@ -382,9 +386,10 @@ extern unsigned long MODULES_END;
  */
 #define PAGE_NONE	__pgprot(_PAGE_TYPE_NONE)
 #define PAGE_RO		__pgprot(_PAGE_TYPE_RO)
-#define PAGE_RW		__pgprot(_PAGE_TYPE_RW)
+#define PAGE_RW		__pgprot(_PAGE_TYPE_RO | _PAGE_SWW)
+#define PAGE_RWC	__pgprot(_PAGE_TYPE_RW | _PAGE_SWW | _PAGE_SWC)
 
-#define PAGE_KERNEL	PAGE_RW
+#define PAGE_KERNEL	PAGE_RWC
 #define PAGE_SHARED	PAGE_KERNEL
 #define PAGE_COPY	PAGE_RO
 
@@ -632,23 +637,23 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste)
 	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
 	/* Clear page changed & referenced bit in the storage key */
 	if (bits & _PAGE_CHANGED)
-		page_set_storage_key(address, skey ^ bits, 1);
+		page_set_storage_key(address, skey ^ bits, 0);
 	else if (bits)
 		page_reset_referenced(address);
 	/* Transfer page changed & referenced bit to guest bits in pgste */
 	pgste_val(pgste) |= bits << 48;		/* RCP_GR_BIT & RCP_GC_BIT */
 	/* Get host changed & referenced bits from pgste */
 	bits |= (pgste_val(pgste) & (RCP_HR_BIT | RCP_HC_BIT)) >> 52;
-	/* Clear host bits in pgste. */
+	/* Transfer page changed & referenced bit to kvm user bits */
+	pgste_val(pgste) |= bits << 45;		/* KVM_UR_BIT & KVM_UC_BIT */
+	/* Clear relevant host bits in pgste. */
 	pgste_val(pgste) &= ~(RCP_HR_BIT | RCP_HC_BIT);
 	pgste_val(pgste) &= ~(RCP_ACC_BITS | RCP_FP_BIT);
 	/* Copy page access key and fetch protection bit to pgste */
 	pgste_val(pgste) |=
 		(unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
-	/* Transfer changed and referenced to kvm user bits */
-	pgste_val(pgste) |= bits << 45;	/* KVM_UR_BIT & KVM_UC_BIT */
-	/* Transfer changed & referenced to pte sofware bits */
-	pte_val(*ptep) |= bits << 1;	/* _PAGE_SWR & _PAGE_SWC */
+	/* Transfer referenced bit to pte */
+	pte_val(*ptep) |= (bits & _PAGE_REFERENCED) << 1;
 #endif
 	return pgste;
 
@@ -661,20 +666,25 @@ static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste)
 
 	if (!pte_present(*ptep))
 		return pgste;
+	/* Get referenced bit from storage key */
 	young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
-	/* Transfer page referenced bit to pte software bit (host view) */
-	if (young || (pgste_val(pgste) & RCP_HR_BIT))
+	if (young)
+		pgste_val(pgste) |= RCP_GR_BIT;
+	/* Get host referenced bit from pgste */
+	if (pgste_val(pgste) & RCP_HR_BIT) {
+		pgste_val(pgste) &= ~RCP_HR_BIT;
+		young = 1;
+	}
+	/* Transfer referenced bit to kvm user bits and pte */
+	if (young) {
+		pgste_val(pgste) |= KVM_UR_BIT;
 		pte_val(*ptep) |= _PAGE_SWR;
-	/* Clear host referenced bit in pgste. */
-	pgste_val(pgste) &= ~RCP_HR_BIT;
-	/* Transfer page referenced bit to guest bit in pgste */
-	pgste_val(pgste) |= (unsigned long) young << 50; /* set RCP_GR_BIT */
+	}
 #endif
 	return pgste;
-
 }
 
-static inline void pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
+static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry)
 {
 #ifdef CONFIG_PGSTE
 	unsigned long address;
@@ -688,10 +698,23 @@ static inline void pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
 	/* Set page access key and fetch protection bit from pgste */
 	nkey |= (pgste_val(pgste) & (RCP_ACC_BITS | RCP_FP_BIT)) >> 56;
 	if (okey != nkey)
-		page_set_storage_key(address, nkey, 1);
+		page_set_storage_key(address, nkey, 0);
 #endif
 }
 
+static inline void pgste_set_pte(pte_t *ptep, pte_t entry)
+{
+	if (!MACHINE_HAS_ESOP && (pte_val(entry) & _PAGE_SWW)) {
+		/*
+		 * Without enhanced suppression-on-protection force
+		 * the dirty bit on for all writable ptes.
+		 */
+		pte_val(entry) |= _PAGE_SWC;
+		pte_val(entry) &= ~_PAGE_RO;
+	}
+	*ptep = entry;
+}
+
 /**
  * struct gmap_struct - guest address space
  * @mm: pointer to the parent mm_struct
@@ -750,11 +773,14 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 
 	if (mm_has_pgste(mm)) {
 		pgste = pgste_get_lock(ptep);
-		pgste_set_pte(ptep, pgste, entry);
-		*ptep = entry;
+		pgste_set_key(ptep, pgste, entry);
+		pgste_set_pte(ptep, entry);
 		pgste_set_unlock(ptep, pgste);
-	} else
+	} else {
+		if (!(pte_val(entry) & _PAGE_INVALID) && MACHINE_HAS_EDAT1)
+			pte_val(entry) |= _PAGE_CO;
 		*ptep = entry;
+	}
 }
 
 /*
@@ -763,16 +789,12 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
  */
 static inline int pte_write(pte_t pte)
 {
-	return (pte_val(pte) & _PAGE_RO) == 0;
+	return (pte_val(pte) & _PAGE_SWW) != 0;
 }
 
 static inline int pte_dirty(pte_t pte)
 {
-#ifdef CONFIG_PGSTE
-	if (pte_val(pte) & _PAGE_SWC)
-		return 1;
-#endif
-	return 0;
+	return (pte_val(pte) & _PAGE_SWC) != 0;
 }
 
 static inline int pte_young(pte_t pte)
@@ -822,11 +844,14 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
 	pte_val(pte) &= _PAGE_CHG_MASK;
 	pte_val(pte) |= pgprot_val(newprot);
+	if ((pte_val(pte) & _PAGE_SWC) && (pte_val(pte) & _PAGE_SWW))
+		pte_val(pte) &= ~_PAGE_RO;
 	return pte;
 }
 
 static inline pte_t pte_wrprotect(pte_t pte)
 {
+	pte_val(pte) &= ~_PAGE_SWW;
 	/* Do not clobber _PAGE_TYPE_NONE pages! */
 	if (!(pte_val(pte) & _PAGE_INVALID))
 		pte_val(pte) |= _PAGE_RO;
@@ -835,20 +860,26 @@ static inline pte_t pte_wrprotect(pte_t pte)
 
 static inline pte_t pte_mkwrite(pte_t pte)
 {
-	pte_val(pte) &= ~_PAGE_RO;
+	pte_val(pte) |= _PAGE_SWW;
+	if (pte_val(pte) & _PAGE_SWC)
+		pte_val(pte) &= ~_PAGE_RO;
 	return pte;
 }
 
 static inline pte_t pte_mkclean(pte_t pte)
 {
-#ifdef CONFIG_PGSTE
 	pte_val(pte) &= ~_PAGE_SWC;
-#endif
+	/* Do not clobber _PAGE_TYPE_NONE pages! */
+	if (!(pte_val(pte) & _PAGE_INVALID))
+		pte_val(pte) |= _PAGE_RO;
 	return pte;
 }
 
 static inline pte_t pte_mkdirty(pte_t pte)
 {
+	pte_val(pte) |= _PAGE_SWC;
+	if (pte_val(pte) & _PAGE_SWW)
+		pte_val(pte) &= ~_PAGE_RO;
 	return pte;
 }
 
@@ -886,10 +917,10 @@ static inline pte_t pte_mkhuge(pte_t pte)
 		pte_val(pte) |= _SEGMENT_ENTRY_INV;
 	}
 	/*
-	 * Clear SW pte bits SWT and SWX, there are no SW bits in a segment
-	 * table entry.
+	 * Clear SW pte bits, there are no SW bits in a segment table entry.
 	 */
-	pte_val(pte) &= ~(_PAGE_SWT | _PAGE_SWX);
+	pte_val(pte) &= ~(_PAGE_SWT | _PAGE_SWX | _PAGE_SWC |
+			  _PAGE_SWR | _PAGE_SWW);
 	/*
 	 * Also set the change-override bit because we don't need dirty bit
 	 * tracking for hugetlbfs pages.
@@ -1041,9 +1072,11 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 					   unsigned long address,
 					   pte_t *ptep, pte_t pte)
 {
-	*ptep = pte;
-	if (mm_has_pgste(mm))
+	if (mm_has_pgste(mm)) {
+		pgste_set_pte(ptep, pte);
 		pgste_set_unlock(ptep, *(pgste_t *)(ptep + PTRS_PER_PTE));
+	} else
+		*ptep = pte;
 }
 
 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
@@ -1111,10 +1144,13 @@ static inline pte_t ptep_set_wrprotect(struct mm_struct *mm,
 
 		if (!mm_exclusive(mm))
 			__ptep_ipte(address, ptep);
-		*ptep = pte_wrprotect(pte);
+		pte = pte_wrprotect(pte);
 
-		if (mm_has_pgste(mm))
+		if (mm_has_pgste(mm)) {
+			pgste_set_pte(ptep, pte);
 			pgste_set_unlock(ptep, pgste);
+		} else
+			*ptep = pte;
 	}
 	return pte;
 }
@@ -1132,10 +1168,12 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 		pgste = pgste_get_lock(ptep);
 
 	__ptep_ipte(address, ptep);
-	*ptep = entry;
 
-	if (mm_has_pgste(vma->vm_mm))
+	if (mm_has_pgste(vma->vm_mm)) {
+		pgste_set_pte(ptep, entry);
 		pgste_set_unlock(ptep, pgste);
+	} else
+		*ptep = entry;
 	return 1;
 }
 
@@ -1153,8 +1191,13 @@ static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
 static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
 {
 	unsigned long physpage = page_to_phys(page);
+	pte_t __pte = mk_pte_phys(physpage, pgprot);
 
-	return mk_pte_phys(physpage, pgprot);
+	if ((pte_val(__pte) & _PAGE_SWW) && PageDirty(page)) {
+		pte_val(__pte) |= _PAGE_SWC;
+		pte_val(__pte) &= ~_PAGE_RO;
+	}
+	return __pte;
 }
 
 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
@@ -1246,6 +1289,8 @@ static inline int pmd_trans_splitting(pmd_t pmd)
 static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 			      pmd_t *pmdp, pmd_t entry)
 {
+	if (!(pmd_val(entry) & _SEGMENT_ENTRY_INV) && MACHINE_HAS_EDAT1)
+		pmd_val(entry) |= _SEGMENT_ENTRY_CO;
 	*pmdp = entry;
 }
 
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index 833788693f09..06a136136047 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -46,7 +46,6 @@ int sclp_cpu_deconfigure(u8 cpu);
 void sclp_facilities_detect(void);
 unsigned long long sclp_get_rnmax(void);
 unsigned long long sclp_get_rzm(void);
-u8 sclp_get_fac85(void);
 int sclp_sdias_blk_count(void);
 int sclp_sdias_copy(void *dest, int blk_num, int nr_blks);
 int sclp_chp_configure(struct chp_id chpid);
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index f69f76b3447a..f6857516e523 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -64,13 +64,14 @@ extern unsigned int s390_user_mode;
 
 #define MACHINE_FLAG_VM		(1UL << 0)
 #define MACHINE_FLAG_IEEE	(1UL << 1)
-#define MACHINE_FLAG_CSP	(1UL << 3)
-#define MACHINE_FLAG_MVPG	(1UL << 4)
-#define MACHINE_FLAG_DIAG44	(1UL << 5)
-#define MACHINE_FLAG_IDTE	(1UL << 6)
-#define MACHINE_FLAG_DIAG9C	(1UL << 7)
-#define MACHINE_FLAG_MVCOS	(1UL << 8)
-#define MACHINE_FLAG_KVM	(1UL << 9)
+#define MACHINE_FLAG_CSP	(1UL << 2)
+#define MACHINE_FLAG_MVPG	(1UL << 3)
+#define MACHINE_FLAG_DIAG44	(1UL << 4)
+#define MACHINE_FLAG_IDTE	(1UL << 5)
+#define MACHINE_FLAG_DIAG9C	(1UL << 6)
+#define MACHINE_FLAG_MVCOS	(1UL << 7)
+#define MACHINE_FLAG_KVM	(1UL << 8)
+#define MACHINE_FLAG_ESOP	(1UL << 9)
 #define MACHINE_FLAG_EDAT1	(1UL << 10)
 #define MACHINE_FLAG_EDAT2	(1UL << 11)
 #define MACHINE_FLAG_LPAR	(1UL << 12)
@@ -84,6 +85,7 @@ extern unsigned int s390_user_mode;
 #define MACHINE_IS_LPAR		(S390_lowcore.machine_flags & MACHINE_FLAG_LPAR)
 
 #define MACHINE_HAS_DIAG9C	(S390_lowcore.machine_flags & MACHINE_FLAG_DIAG9C)
+#define MACHINE_HAS_ESOP	(S390_lowcore.machine_flags & MACHINE_FLAG_ESOP)
 #define MACHINE_HAS_PFMF	MACHINE_HAS_EDAT1
 #define MACHINE_HAS_HPAGE	MACHINE_HAS_EDAT1
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f090e819bf71..2923781590a6 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -147,7 +147,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = KVM_MAX_VCPUS;
 		break;
 	case KVM_CAP_S390_COW:
-		r = sclp_get_fac85() & 0x2;
+		r = MACHINE_HAS_ESOP;
 		break;
 	default:
 		r = 0;
diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c
index 9017a63dda3d..a70ee84c0241 100644
--- a/arch/s390/lib/uaccess_pt.c
+++ b/arch/s390/lib/uaccess_pt.c
@@ -50,7 +50,7 @@ static __always_inline unsigned long follow_table(struct mm_struct *mm,
 	ptep = pte_offset_map(pmd, addr);
 	if (!pte_present(*ptep))
 		return -0x11UL;
-	if (write && !pte_write(*ptep))
+	if (write && (!pte_write(*ptep) || !pte_dirty(*ptep)))
 		return -0x04UL;
 
 	return (pte_val(*ptep) & PAGE_MASK) + (addr & ~PAGE_MASK);
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index 29ccee3651f4..d21040ed5e59 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -127,7 +127,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 			pte_val(*pte) = _PAGE_TYPE_EMPTY;
 			continue;
 		}
-		*pte = mk_pte_phys(address, __pgprot(_PAGE_TYPE_RW));
+		pte_val(*pte) = __pa(address);
 	}
 }
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 6ed1426d27c5..79699f46a443 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -85,11 +85,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
 	pud_t *pu_dir;
 	pmd_t *pm_dir;
 	pte_t *pt_dir;
-	pte_t  pte;
 	int ret = -ENOMEM;
 
 	while (address < end) {
-		pte = mk_pte_phys(address, __pgprot(ro ? _PAGE_RO : 0));
 		pg_dir = pgd_offset_k(address);
 		if (pgd_none(*pg_dir)) {
 			pu_dir = vmem_pud_alloc();
@@ -101,9 +99,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
 #if defined(CONFIG_64BIT) && !defined(CONFIG_DEBUG_PAGEALLOC)
 		if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
 		    !(address & ~PUD_MASK) && (address + PUD_SIZE <= end)) {
-			pte_val(pte) |= _REGION3_ENTRY_LARGE;
-			pte_val(pte) |= _REGION_ENTRY_TYPE_R3;
-			pud_val(*pu_dir) = pte_val(pte);
+			pud_val(*pu_dir) = __pa(address) |
+				_REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE |
+				(ro ? _REGION_ENTRY_RO : 0);
 			address += PUD_SIZE;
 			continue;
 		}
@@ -118,8 +116,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
 #if defined(CONFIG_64BIT) && !defined(CONFIG_DEBUG_PAGEALLOC)
 		if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
 		    !(address & ~PMD_MASK) && (address + PMD_SIZE <= end)) {
-			pte_val(pte) |= _SEGMENT_ENTRY_LARGE;
-			pmd_val(*pm_dir) = pte_val(pte);
+			pmd_val(*pm_dir) = __pa(address) |
+				_SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE |
+				(ro ? _SEGMENT_ENTRY_RO : 0);
 			address += PMD_SIZE;
 			continue;
 		}
@@ -132,7 +131,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
 		}
 
 		pt_dir = pte_offset_kernel(pm_dir, address);
-		*pt_dir = pte;
+		pte_val(*pt_dir) = __pa(address) | (ro ? _PAGE_RO : 0);
 		address += PAGE_SIZE;
 	}
 	ret = 0;
@@ -199,7 +198,6 @@ int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
 	pud_t *pu_dir;
 	pmd_t *pm_dir;
 	pte_t *pt_dir;
-	pte_t  pte;
 	int ret = -ENOMEM;
 
 	start_addr = (unsigned long) start;
@@ -237,9 +235,8 @@ int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
 			new_page = vmemmap_alloc_block(PMD_SIZE, node);
 			if (!new_page)
 				goto out;
-			pte = mk_pte_phys(__pa(new_page), PAGE_RW);
-			pte_val(pte) |= _SEGMENT_ENTRY_LARGE;
-			pmd_val(*pm_dir) = pte_val(pte);
+			pmd_val(*pm_dir) = __pa(new_page) |
+				_SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE;
 			address = (address + PMD_SIZE) & PMD_MASK;
 			continue;
 		}
@@ -260,8 +257,7 @@ int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
 			new_page =__pa(vmem_alloc_pages(0));
 			if (!new_page)
 				goto out;
-			pte = pfn_pte(new_page >> PAGE_SHIFT, PAGE_KERNEL);
-			*pt_dir = pte;
+			pte_val(*pt_dir) = __pa(new_page);
 		}
 		address += PAGE_SIZE;
 	}
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index c44d13f607bc..30a2255389e5 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -56,7 +56,6 @@ static int __initdata early_read_info_sccb_valid;
 
 u64 sclp_facilities;
 static u8 sclp_fac84;
-static u8 sclp_fac85;
 static unsigned long long rzm;
 static unsigned long long rnmax;
 
@@ -131,7 +130,8 @@ void __init sclp_facilities_detect(void)
 	sccb = &early_read_info_sccb;
 	sclp_facilities = sccb->facilities;
 	sclp_fac84 = sccb->fac84;
-	sclp_fac85 = sccb->fac85;
+	if (sccb->fac85 & 0x02)
+		S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP;
 	rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2;
 	rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2;
 	rzm <<= 20;
@@ -171,12 +171,6 @@ unsigned long long sclp_get_rzm(void)
 	return rzm;
 }
 
-u8 sclp_get_fac85(void)
-{
-	return sclp_fac85;
-}
-EXPORT_SYMBOL_GPL(sclp_get_fac85);
-
 /*
  * This function will be called after sclp_facilities_detect(), which gets
  * called from early.c code. Therefore the sccb should have valid contents.
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 5cf680a98f9b..bfd87685fc1f 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -197,16 +197,6 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
-#ifndef __HAVE_ARCH_PAGE_TEST_AND_CLEAR_DIRTY
-#define page_test_and_clear_dirty(pfn, mapped)	(0)
-#endif
-
-#ifndef __HAVE_ARCH_PAGE_TEST_AND_CLEAR_DIRTY
-#define pte_maybe_dirty(pte)		pte_dirty(pte)
-#else
-#define pte_maybe_dirty(pte)		(1)
-#endif
-
 #ifndef __HAVE_ARCH_PAGE_TEST_AND_CLEAR_YOUNG
 #define page_test_and_clear_young(pfn) (0)
 #endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 70473da47b3f..6d53675c2b54 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -303,21 +303,13 @@ static inline void __SetPageUptodate(struct page *page)
 
 static inline void SetPageUptodate(struct page *page)
 {
-#ifdef CONFIG_S390
-	if (!test_and_set_bit(PG_uptodate, &page->flags))
-		page_set_storage_key(page_to_phys(page), PAGE_DEFAULT_KEY, 0);
-#else
 	/*
 	 * Memory barrier must be issued before setting the PG_uptodate bit,
 	 * so that all previous stores issued in order to bring the page
 	 * uptodate are actually visible before PageUptodate becomes true.
-	 *
-	 * s390 doesn't need an explicit smp_wmb here because the test and
-	 * set bit already provides full barriers.
 	 */
 	smp_wmb();
 	set_bit(PG_uptodate, &(page)->flags);
-#endif
 }
 
 CLEARPAGEFLAG(Uptodate, uptodate)
diff --git a/mm/rmap.c b/mm/rmap.c
index 2c78f8cadc95..3d38edffda41 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1126,7 +1126,6 @@ void page_add_file_rmap(struct page *page)
  */
 void page_remove_rmap(struct page *page)
 {
-	struct address_space *mapping = page_mapping(page);
 	bool anon = PageAnon(page);
 	bool locked;
 	unsigned long flags;
@@ -1144,29 +1143,6 @@ void page_remove_rmap(struct page *page)
 		goto out;
 
 	/*
-	 * Now that the last pte has gone, s390 must transfer dirty
-	 * flag from storage key to struct page.  We can usually skip
-	 * this if the page is anon, so about to be freed; but perhaps
-	 * not if it's in swapcache - there might be another pte slot
-	 * containing the swap entry, but page not yet written to swap.
-	 *
-	 * And we can skip it on file pages, so long as the filesystem
-	 * participates in dirty tracking (note that this is not only an
-	 * optimization but also solves problems caused by dirty flag in
-	 * storage key getting set by a write from inside kernel); but need to
-	 * catch shm and tmpfs and ramfs pages which have been modified since
-	 * creation by read fault.
-	 *
-	 * Note that mapping must be decided above, before decrementing
-	 * mapcount (which luckily provides a barrier): once page is unmapped,
-	 * it could be truncated and page->mapping reset to NULL at any moment.
-	 * Note also that we are relying on page_mapping(page) to set mapping
-	 * to &swapper_space when PageSwapCache(page).
-	 */
-	if (mapping && !mapping_cap_account_dirty(mapping) &&
-	    page_test_and_clear_dirty(page_to_pfn(page), 1))
-		set_page_dirty(page);
-	/*
 	 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
 	 * and not charged by memcg for now.
 	 */