diff options
author | Martin Schwidefsky <schwidefsky@de.ibm.com> | 2015-04-15 07:23:26 -0400 |
---|---|---|
committer | Martin Schwidefsky <schwidefsky@de.ibm.com> | 2015-04-23 10:55:49 -0400 |
commit | 0b46e0a3ec0d7a04af6a091354f1b5e1b952d70a (patch) | |
tree | 011ed5650974aad0df2e6a6c9952e1fc2f935dec /arch/s390 | |
parent | 7e01b5acd88b3f3108d8c4ce44e3205d67437202 (diff) |
s390/kvm: remove delayed reallocation of page tables for KVM
Replacing a 2K page table with a 4K page table while a VMA is active
for the affected memory region is fundamentally broken. Rip out the
page table reallocation code and replace it with a simple system
control 'vm.allocate_pgste'. If the system control is set, the page
tables for all processes are allocated as full 4K pages, even for
processes that do not need them.
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Diffstat (limited to 'arch/s390')
-rw-r--r-- | arch/s390/include/asm/mmu.h | 4 | ||||
-rw-r--r-- | arch/s390/include/asm/mmu_context.h | 3 | ||||
-rw-r--r-- | arch/s390/include/asm/pgalloc.h | 1 | ||||
-rw-r--r-- | arch/s390/include/asm/pgtable.h | 9 | ||||
-rw-r--r-- | arch/s390/mm/pgtable.c | 142 |
5 files changed, 59 insertions, 100 deletions
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index a5e656260a70..d29ad9545b41 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h | |||
@@ -14,7 +14,9 @@ typedef struct { | |||
14 | unsigned long asce_bits; | 14 | unsigned long asce_bits; |
15 | unsigned long asce_limit; | 15 | unsigned long asce_limit; |
16 | unsigned long vdso_base; | 16 | unsigned long vdso_base; |
17 | /* The mmu context has extended page tables. */ | 17 | /* The mmu context allocates 4K page tables. */ |
18 | unsigned int alloc_pgste:1; | ||
19 | /* The mmu context uses extended page tables. */ | ||
18 | unsigned int has_pgste:1; | 20 | unsigned int has_pgste:1; |
19 | /* The mmu context uses storage keys. */ | 21 | /* The mmu context uses storage keys. */ |
20 | unsigned int use_skey:1; | 22 | unsigned int use_skey:1; |
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index d25d9ff10ba8..fb1b93ea3e3f 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h | |||
@@ -20,8 +20,11 @@ static inline int init_new_context(struct task_struct *tsk, | |||
20 | mm->context.flush_mm = 0; | 20 | mm->context.flush_mm = 0; |
21 | mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS; | 21 | mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS; |
22 | mm->context.asce_bits |= _ASCE_TYPE_REGION3; | 22 | mm->context.asce_bits |= _ASCE_TYPE_REGION3; |
23 | #ifdef CONFIG_PGSTE | ||
24 | mm->context.alloc_pgste = page_table_allocate_pgste; | ||
23 | mm->context.has_pgste = 0; | 25 | mm->context.has_pgste = 0; |
24 | mm->context.use_skey = 0; | 26 | mm->context.use_skey = 0; |
27 | #endif | ||
25 | mm->context.asce_limit = STACK_TOP_MAX; | 28 | mm->context.asce_limit = STACK_TOP_MAX; |
26 | crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); | 29 | crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); |
27 | return 0; | 30 | return 0; |
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 51e7fb634ebc..7b7858f158b4 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h | |||
@@ -21,6 +21,7 @@ void crst_table_free(struct mm_struct *, unsigned long *); | |||
21 | unsigned long *page_table_alloc(struct mm_struct *); | 21 | unsigned long *page_table_alloc(struct mm_struct *); |
22 | void page_table_free(struct mm_struct *, unsigned long *); | 22 | void page_table_free(struct mm_struct *, unsigned long *); |
23 | void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); | 23 | void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); |
24 | extern int page_table_allocate_pgste; | ||
24 | 25 | ||
25 | int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, | 26 | int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, |
26 | unsigned long key, bool nq); | 27 | unsigned long key, bool nq); |
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 989cfae9e202..1fba63997d50 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h | |||
@@ -423,6 +423,15 @@ static inline int mm_has_pgste(struct mm_struct *mm) | |||
423 | return 0; | 423 | return 0; |
424 | } | 424 | } |
425 | 425 | ||
426 | static inline int mm_alloc_pgste(struct mm_struct *mm) | ||
427 | { | ||
428 | #ifdef CONFIG_PGSTE | ||
429 | if (unlikely(mm->context.alloc_pgste)) | ||
430 | return 1; | ||
431 | #endif | ||
432 | return 0; | ||
433 | } | ||
434 | |||
426 | /* | 435 | /* |
427 | * In the case that a guest uses storage keys | 436 | * In the case that a guest uses storage keys |
428 | * faults should no longer be backed by zero pages | 437 | * faults should no longer be backed by zero pages |
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 33f589459113..b33f66110ca9 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
20 | #include <linux/swapops.h> | 20 | #include <linux/swapops.h> |
21 | #include <linux/sysctl.h> | ||
21 | #include <linux/ksm.h> | 22 | #include <linux/ksm.h> |
22 | #include <linux/mman.h> | 23 | #include <linux/mman.h> |
23 | 24 | ||
@@ -920,6 +921,40 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr) | |||
920 | } | 921 | } |
921 | EXPORT_SYMBOL(get_guest_storage_key); | 922 | EXPORT_SYMBOL(get_guest_storage_key); |
922 | 923 | ||
924 | static int page_table_allocate_pgste_min = 0; | ||
925 | static int page_table_allocate_pgste_max = 1; | ||
926 | int page_table_allocate_pgste = 0; | ||
927 | EXPORT_SYMBOL(page_table_allocate_pgste); | ||
928 | |||
929 | static struct ctl_table page_table_sysctl[] = { | ||
930 | { | ||
931 | .procname = "allocate_pgste", | ||
932 | .data = &page_table_allocate_pgste, | ||
933 | .maxlen = sizeof(int), | ||
934 | .mode = S_IRUGO | S_IWUSR, | ||
935 | .proc_handler = proc_dointvec, | ||
936 | .extra1 = &page_table_allocate_pgste_min, | ||
937 | .extra2 = &page_table_allocate_pgste_max, | ||
938 | }, | ||
939 | { } | ||
940 | }; | ||
941 | |||
942 | static struct ctl_table page_table_sysctl_dir[] = { | ||
943 | { | ||
944 | .procname = "vm", | ||
945 | .maxlen = 0, | ||
946 | .mode = 0555, | ||
947 | .child = page_table_sysctl, | ||
948 | }, | ||
949 | { } | ||
950 | }; | ||
951 | |||
952 | static int __init page_table_register_sysctl(void) | ||
953 | { | ||
954 | return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; | ||
955 | } | ||
956 | __initcall(page_table_register_sysctl); | ||
957 | |||
923 | #else /* CONFIG_PGSTE */ | 958 | #else /* CONFIG_PGSTE */ |
924 | 959 | ||
925 | static inline int page_table_with_pgste(struct page *page) | 960 | static inline int page_table_with_pgste(struct page *page) |
@@ -963,7 +998,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) | |||
963 | struct page *uninitialized_var(page); | 998 | struct page *uninitialized_var(page); |
964 | unsigned int mask, bit; | 999 | unsigned int mask, bit; |
965 | 1000 | ||
966 | if (mm_has_pgste(mm)) | 1001 | if (mm_alloc_pgste(mm)) |
967 | return page_table_alloc_pgste(mm); | 1002 | return page_table_alloc_pgste(mm); |
968 | /* Allocate fragments of a 4K page as 1K/2K page table */ | 1003 | /* Allocate fragments of a 4K page as 1K/2K page table */ |
969 | spin_lock_bh(&mm->context.list_lock); | 1004 | spin_lock_bh(&mm->context.list_lock); |
@@ -1165,116 +1200,25 @@ static inline void thp_split_mm(struct mm_struct *mm) | |||
1165 | } | 1200 | } |
1166 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 1201 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
1167 | 1202 | ||
1168 | static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb, | ||
1169 | struct mm_struct *mm, pud_t *pud, | ||
1170 | unsigned long addr, unsigned long end) | ||
1171 | { | ||
1172 | unsigned long next, *table, *new; | ||
1173 | struct page *page; | ||
1174 | spinlock_t *ptl; | ||
1175 | pmd_t *pmd; | ||
1176 | |||
1177 | pmd = pmd_offset(pud, addr); | ||
1178 | do { | ||
1179 | next = pmd_addr_end(addr, end); | ||
1180 | again: | ||
1181 | if (pmd_none_or_clear_bad(pmd)) | ||
1182 | continue; | ||
1183 | table = (unsigned long *) pmd_deref(*pmd); | ||
1184 | page = pfn_to_page(__pa(table) >> PAGE_SHIFT); | ||
1185 | if (page_table_with_pgste(page)) | ||
1186 | continue; | ||
1187 | /* Allocate new page table with pgstes */ | ||
1188 | new = page_table_alloc_pgste(mm); | ||
1189 | if (!new) | ||
1190 | return -ENOMEM; | ||
1191 | |||
1192 | ptl = pmd_lock(mm, pmd); | ||
1193 | if (likely((unsigned long *) pmd_deref(*pmd) == table)) { | ||
1194 | /* Nuke pmd entry pointing to the "short" page table */ | ||
1195 | pmdp_flush_lazy(mm, addr, pmd); | ||
1196 | pmd_clear(pmd); | ||
1197 | /* Copy ptes from old table to new table */ | ||
1198 | memcpy(new, table, PAGE_SIZE/2); | ||
1199 | clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); | ||
1200 | /* Establish new table */ | ||
1201 | pmd_populate(mm, pmd, (pte_t *) new); | ||
1202 | /* Free old table with rcu, there might be a walker! */ | ||
1203 | page_table_free_rcu(tlb, table, addr); | ||
1204 | new = NULL; | ||
1205 | } | ||
1206 | spin_unlock(ptl); | ||
1207 | if (new) { | ||
1208 | page_table_free_pgste(new); | ||
1209 | goto again; | ||
1210 | } | ||
1211 | } while (pmd++, addr = next, addr != end); | ||
1212 | |||
1213 | return addr; | ||
1214 | } | ||
1215 | |||
1216 | static unsigned long page_table_realloc_pud(struct mmu_gather *tlb, | ||
1217 | struct mm_struct *mm, pgd_t *pgd, | ||
1218 | unsigned long addr, unsigned long end) | ||
1219 | { | ||
1220 | unsigned long next; | ||
1221 | pud_t *pud; | ||
1222 | |||
1223 | pud = pud_offset(pgd, addr); | ||
1224 | do { | ||
1225 | next = pud_addr_end(addr, end); | ||
1226 | if (pud_none_or_clear_bad(pud)) | ||
1227 | continue; | ||
1228 | next = page_table_realloc_pmd(tlb, mm, pud, addr, next); | ||
1229 | if (unlikely(IS_ERR_VALUE(next))) | ||
1230 | return next; | ||
1231 | } while (pud++, addr = next, addr != end); | ||
1232 | |||
1233 | return addr; | ||
1234 | } | ||
1235 | |||
1236 | static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm, | ||
1237 | unsigned long addr, unsigned long end) | ||
1238 | { | ||
1239 | unsigned long next; | ||
1240 | pgd_t *pgd; | ||
1241 | |||
1242 | pgd = pgd_offset(mm, addr); | ||
1243 | do { | ||
1244 | next = pgd_addr_end(addr, end); | ||
1245 | if (pgd_none_or_clear_bad(pgd)) | ||
1246 | continue; | ||
1247 | next = page_table_realloc_pud(tlb, mm, pgd, addr, next); | ||
1248 | if (unlikely(IS_ERR_VALUE(next))) | ||
1249 | return next; | ||
1250 | } while (pgd++, addr = next, addr != end); | ||
1251 | |||
1252 | return 0; | ||
1253 | } | ||
1254 | |||
1255 | /* | 1203 | /* |
1256 | * switch on pgstes for its userspace process (for kvm) | 1204 | * switch on pgstes for its userspace process (for kvm) |
1257 | */ | 1205 | */ |
1258 | int s390_enable_sie(void) | 1206 | int s390_enable_sie(void) |
1259 | { | 1207 | { |
1260 | struct task_struct *tsk = current; | 1208 | struct mm_struct *mm = current->mm; |
1261 | struct mm_struct *mm = tsk->mm; | ||
1262 | struct mmu_gather tlb; | ||
1263 | 1209 | ||
1264 | /* Do we have pgstes? if yes, we are done */ | 1210 | /* Do we have pgstes? if yes, we are done */ |
1265 | if (mm_has_pgste(tsk->mm)) | 1211 | if (mm_has_pgste(mm)) |
1266 | return 0; | 1212 | return 0; |
1267 | 1213 | /* Fail if the page tables are 2K */ | |
1214 | if (!mm_alloc_pgste(mm)) | ||
1215 | return -EINVAL; | ||
1268 | down_write(&mm->mmap_sem); | 1216 | down_write(&mm->mmap_sem); |
1217 | mm->context.has_pgste = 1; | ||
1269 | /* split thp mappings and disable thp for future mappings */ | 1218 | /* split thp mappings and disable thp for future mappings */ |
1270 | thp_split_mm(mm); | 1219 | thp_split_mm(mm); |
1271 | /* Reallocate the page tables with pgstes */ | ||
1272 | tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE); | ||
1273 | if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE)) | ||
1274 | mm->context.has_pgste = 1; | ||
1275 | tlb_finish_mmu(&tlb, 0, TASK_SIZE); | ||
1276 | up_write(&mm->mmap_sem); | 1220 | up_write(&mm->mmap_sem); |
1277 | return mm->context.has_pgste ? 0 : -ENOMEM; | 1221 | return 0; |
1278 | } | 1222 | } |
1279 | EXPORT_SYMBOL_GPL(s390_enable_sie); | 1223 | EXPORT_SYMBOL_GPL(s390_enable_sie); |
1280 | 1224 | ||