about | summary | refs | log | tree | commit | diff | stats
path: root/arch/s390
diff options
context:
space:
mode:
author: Martin Schwidefsky <schwidefsky@de.ibm.com> 2015-04-15 07:23:26 -0400
committer: Martin Schwidefsky <schwidefsky@de.ibm.com> 2015-04-23 10:55:49 -0400
commit: 0b46e0a3ec0d7a04af6a091354f1b5e1b952d70a (patch)
tree: 011ed5650974aad0df2e6a6c9952e1fc2f935dec /arch/s390
parent: 7e01b5acd88b3f3108d8c4ce44e3205d67437202 (diff)
s390/kvm: remove delayed reallocation of page tables for KVM
Replacing a 2K page table with a 4K page table while a VMA is active for the affected memory region is fundamentally broken. Rip out the page table reallocation code and replace it with a simple system control 'vm.allocate_pgste'. If the system control is set, the page tables for all processes are allocated as full 4K pages, even for processes that do not need them.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Diffstat (limited to 'arch/s390')
-rw-r--r-- arch/s390/include/asm/mmu.h 4
-rw-r--r-- arch/s390/include/asm/mmu_context.h 3
-rw-r--r-- arch/s390/include/asm/pgalloc.h 1
-rw-r--r-- arch/s390/include/asm/pgtable.h 9
-rw-r--r-- arch/s390/mm/pgtable.c 142
5 files changed, 59 insertions, 100 deletions
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index a5e656260a70..d29ad9545b41 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -14,7 +14,9 @@ typedef struct {
14 unsigned long asce_bits; 14 unsigned long asce_bits;
15 unsigned long asce_limit; 15 unsigned long asce_limit;
16 unsigned long vdso_base; 16 unsigned long vdso_base;
17 /* The mmu context has extended page tables. */ 17 /* The mmu context allocates 4K page tables. */
18 unsigned int alloc_pgste:1;
19 /* The mmu context uses extended page tables. */
18 unsigned int has_pgste:1; 20 unsigned int has_pgste:1;
19 /* The mmu context uses storage keys. */ 21 /* The mmu context uses storage keys. */
20 unsigned int use_skey:1; 22 unsigned int use_skey:1;
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index d25d9ff10ba8..fb1b93ea3e3f 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -20,8 +20,11 @@ static inline int init_new_context(struct task_struct *tsk,
20 mm->context.flush_mm = 0; 20 mm->context.flush_mm = 0;
21 mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS; 21 mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS;
22 mm->context.asce_bits |= _ASCE_TYPE_REGION3; 22 mm->context.asce_bits |= _ASCE_TYPE_REGION3;
23#ifdef CONFIG_PGSTE
24 mm->context.alloc_pgste = page_table_allocate_pgste;
23 mm->context.has_pgste = 0; 25 mm->context.has_pgste = 0;
24 mm->context.use_skey = 0; 26 mm->context.use_skey = 0;
27#endif
25 mm->context.asce_limit = STACK_TOP_MAX; 28 mm->context.asce_limit = STACK_TOP_MAX;
26 crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); 29 crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
27 return 0; 30 return 0;
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 51e7fb634ebc..7b7858f158b4 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -21,6 +21,7 @@ void crst_table_free(struct mm_struct *, unsigned long *);
21unsigned long *page_table_alloc(struct mm_struct *); 21unsigned long *page_table_alloc(struct mm_struct *);
22void page_table_free(struct mm_struct *, unsigned long *); 22void page_table_free(struct mm_struct *, unsigned long *);
23void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); 23void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
24extern int page_table_allocate_pgste;
24 25
25int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, 26int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
26 unsigned long key, bool nq); 27 unsigned long key, bool nq);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 989cfae9e202..1fba63997d50 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -423,6 +423,15 @@ static inline int mm_has_pgste(struct mm_struct *mm)
423 return 0; 423 return 0;
424} 424}
425 425
426static inline int mm_alloc_pgste(struct mm_struct *mm)
427{
428#ifdef CONFIG_PGSTE
429 if (unlikely(mm->context.alloc_pgste))
430 return 1;
431#endif
432 return 0;
433}
434
426/* 435/*
427 * In the case that a guest uses storage keys 436 * In the case that a guest uses storage keys
428 * faults should no longer be backed by zero pages 437 * faults should no longer be backed by zero pages
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 33f589459113..b33f66110ca9 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -18,6 +18,7 @@
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/swapops.h> 20#include <linux/swapops.h>
21#include <linux/sysctl.h>
21#include <linux/ksm.h> 22#include <linux/ksm.h>
22#include <linux/mman.h> 23#include <linux/mman.h>
23 24
@@ -920,6 +921,40 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
920} 921}
921EXPORT_SYMBOL(get_guest_storage_key); 922EXPORT_SYMBOL(get_guest_storage_key);
922 923
924static int page_table_allocate_pgste_min = 0;
925static int page_table_allocate_pgste_max = 1;
926int page_table_allocate_pgste = 0;
927EXPORT_SYMBOL(page_table_allocate_pgste);
928
929static struct ctl_table page_table_sysctl[] = {
930 {
931 .procname = "allocate_pgste",
932 .data = &page_table_allocate_pgste,
933 .maxlen = sizeof(int),
934 .mode = S_IRUGO | S_IWUSR,
935 .proc_handler = proc_dointvec,
936 .extra1 = &page_table_allocate_pgste_min,
937 .extra2 = &page_table_allocate_pgste_max,
938 },
939 { }
940};
941
942static struct ctl_table page_table_sysctl_dir[] = {
943 {
944 .procname = "vm",
945 .maxlen = 0,
946 .mode = 0555,
947 .child = page_table_sysctl,
948 },
949 { }
950};
951
952static int __init page_table_register_sysctl(void)
953{
954 return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
955}
956__initcall(page_table_register_sysctl);
957
923#else /* CONFIG_PGSTE */ 958#else /* CONFIG_PGSTE */
924 959
925static inline int page_table_with_pgste(struct page *page) 960static inline int page_table_with_pgste(struct page *page)
@@ -963,7 +998,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
963 struct page *uninitialized_var(page); 998 struct page *uninitialized_var(page);
964 unsigned int mask, bit; 999 unsigned int mask, bit;
965 1000
966 if (mm_has_pgste(mm)) 1001 if (mm_alloc_pgste(mm))
967 return page_table_alloc_pgste(mm); 1002 return page_table_alloc_pgste(mm);
968 /* Allocate fragments of a 4K page as 1K/2K page table */ 1003 /* Allocate fragments of a 4K page as 1K/2K page table */
969 spin_lock_bh(&mm->context.list_lock); 1004 spin_lock_bh(&mm->context.list_lock);
@@ -1165,116 +1200,25 @@ static inline void thp_split_mm(struct mm_struct *mm)
1165} 1200}
1166#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1201#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1167 1202
1168static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
1169 struct mm_struct *mm, pud_t *pud,
1170 unsigned long addr, unsigned long end)
1171{
1172 unsigned long next, *table, *new;
1173 struct page *page;
1174 spinlock_t *ptl;
1175 pmd_t *pmd;
1176
1177 pmd = pmd_offset(pud, addr);
1178 do {
1179 next = pmd_addr_end(addr, end);
1180again:
1181 if (pmd_none_or_clear_bad(pmd))
1182 continue;
1183 table = (unsigned long *) pmd_deref(*pmd);
1184 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1185 if (page_table_with_pgste(page))
1186 continue;
1187 /* Allocate new page table with pgstes */
1188 new = page_table_alloc_pgste(mm);
1189 if (!new)
1190 return -ENOMEM;
1191
1192 ptl = pmd_lock(mm, pmd);
1193 if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
1194 /* Nuke pmd entry pointing to the "short" page table */
1195 pmdp_flush_lazy(mm, addr, pmd);
1196 pmd_clear(pmd);
1197 /* Copy ptes from old table to new table */
1198 memcpy(new, table, PAGE_SIZE/2);
1199 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
1200 /* Establish new table */
1201 pmd_populate(mm, pmd, (pte_t *) new);
1202 /* Free old table with rcu, there might be a walker! */
1203 page_table_free_rcu(tlb, table, addr);
1204 new = NULL;
1205 }
1206 spin_unlock(ptl);
1207 if (new) {
1208 page_table_free_pgste(new);
1209 goto again;
1210 }
1211 } while (pmd++, addr = next, addr != end);
1212
1213 return addr;
1214}
1215
1216static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
1217 struct mm_struct *mm, pgd_t *pgd,
1218 unsigned long addr, unsigned long end)
1219{
1220 unsigned long next;
1221 pud_t *pud;
1222
1223 pud = pud_offset(pgd, addr);
1224 do {
1225 next = pud_addr_end(addr, end);
1226 if (pud_none_or_clear_bad(pud))
1227 continue;
1228 next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
1229 if (unlikely(IS_ERR_VALUE(next)))
1230 return next;
1231 } while (pud++, addr = next, addr != end);
1232
1233 return addr;
1234}
1235
1236static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
1237 unsigned long addr, unsigned long end)
1238{
1239 unsigned long next;
1240 pgd_t *pgd;
1241
1242 pgd = pgd_offset(mm, addr);
1243 do {
1244 next = pgd_addr_end(addr, end);
1245 if (pgd_none_or_clear_bad(pgd))
1246 continue;
1247 next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
1248 if (unlikely(IS_ERR_VALUE(next)))
1249 return next;
1250 } while (pgd++, addr = next, addr != end);
1251
1252 return 0;
1253}
1254
1255/* 1203/*
1256 * switch on pgstes for its userspace process (for kvm) 1204 * switch on pgstes for its userspace process (for kvm)
1257 */ 1205 */
1258int s390_enable_sie(void) 1206int s390_enable_sie(void)
1259{ 1207{
1260 struct task_struct *tsk = current; 1208 struct mm_struct *mm = current->mm;
1261 struct mm_struct *mm = tsk->mm;
1262 struct mmu_gather tlb;
1263 1209
1264 /* Do we have pgstes? if yes, we are done */ 1210 /* Do we have pgstes? if yes, we are done */
1265 if (mm_has_pgste(tsk->mm)) 1211 if (mm_has_pgste(mm))
1266 return 0; 1212 return 0;
1267 1213 /* Fail if the page tables are 2K */
1214 if (!mm_alloc_pgste(mm))
1215 return -EINVAL;
1268 down_write(&mm->mmap_sem); 1216 down_write(&mm->mmap_sem);
1217 mm->context.has_pgste = 1;
1269 /* split thp mappings and disable thp for future mappings */ 1218 /* split thp mappings and disable thp for future mappings */
1270 thp_split_mm(mm); 1219 thp_split_mm(mm);
1271 /* Reallocate the page tables with pgstes */
1272 tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
1273 if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
1274 mm->context.has_pgste = 1;
1275 tlb_finish_mmu(&tlb, 0, TASK_SIZE);
1276 up_write(&mm->mmap_sem); 1220 up_write(&mm->mmap_sem);
1277 return mm->context.has_pgste ? 0 : -ENOMEM; 1221 return 0;
1278} 1222}
1279EXPORT_SYMBOL_GPL(s390_enable_sie); 1223EXPORT_SYMBOL_GPL(s390_enable_sie);
1280 1224