author      Martin Schwidefsky <schwidefsky@de.ibm.com>    2016-03-08 05:49:57 -0500
committer   Martin Schwidefsky <schwidefsky@de.ibm.com>    2016-03-08 09:00:15 -0500
commit      1e133ab296f3ff8d9e58a5e758291ed39ba72ad7 (patch)
tree        b64997e3dd0979e6e4bc26ca66c39ace0ac83589 /arch/s390/mm
parent      227be799c39a28bf5d68187a4ea1b43190d96515 (diff)
s390/mm: split arch/s390/mm/pgtable.c
The pgtable.c file is quite big; before it grows any larger, split it
into pgtable.c, pgalloc.c and gmap.c. In addition, move the gmap related
header definitions into the new gmap.h header and all of the pgste
helpers from pgtable.h to pgtable.c.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
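For orientation, the following is a minimal sketch, not part of this patch, of how a hypervisor backend such as KVM typically drives the gmap interface that now lives in gmap.c. The helper name example_gmap_use, the guest addresses and the FAULT_FLAG_WRITE choice are made up for the example; the gmap_* calls themselves are the ones added below.

#include <linux/mm.h>
#include <asm/gmap.h>

/* Illustrative only: map one 1 MB segment of user memory into a guest
 * address space and resolve a write fault on its first page. */
static int example_gmap_use(struct mm_struct *mm, unsigned long uaddr)
{
        struct gmap *gmap;
        int rc;

        gmap = gmap_alloc(mm, (1UL << 31) - 1); /* 2 GB guest limit */
        if (!gmap)
                return -ENOMEM;
        gmap_enable(gmap);      /* switch to the guest space, see gmap_enable() below */

        /* uaddr and the guest target must be PMD_SIZE (1 MB) aligned */
        rc = gmap_map_segment(gmap, uaddr, 0x100000UL, PMD_SIZE);
        if (!rc)
                rc = gmap_fault(gmap, 0x100000UL, FAULT_FLAG_WRITE);

        gmap_disable(gmap);
        gmap_free(gmap);
        return rc;
}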
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--  arch/s390/mm/Makefile  |    4
-rw-r--r--  arch/s390/mm/fault.c   |    1
-rw-r--r--  arch/s390/mm/gmap.c    |  774
-rw-r--r--  arch/s390/mm/pgalloc.c |  360
-rw-r--r--  arch/s390/mm/pgtable.c | 1532
5 files changed, 1378 insertions, 1293 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 839592ca265c..2ae54cad2b6a 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -2,9 +2,11 @@
 # Makefile for the linux s390-specific parts of the memory manager.
 #
 
-obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o
+obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o
 obj-y += page-states.o gup.o extable.o pageattr.o mem_detect.o
+obj-y += pgtable.o pgalloc.o
 
 obj-$(CONFIG_CMM) += cmm.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o
+obj-$(CONFIG_PGSTE) += gmap.o
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 64b3ad1d6575..cce577feab1e 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -32,6 +32,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/diag.h>
 #include <asm/pgtable.h>
+#include <asm/gmap.h>
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/facility.h>
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
new file mode 100644
index 000000000000..69247b4dcc43
--- /dev/null
+++ b/arch/s390/mm/gmap.c
@@ -0,0 +1,774 @@
1/*
2 * KVM guest address space mapping code
3 *
4 * Copyright IBM Corp. 2007, 2016
5 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
6 */
7
8#include <linux/kernel.h>
9#include <linux/mm.h>
10#include <linux/swap.h>
11#include <linux/smp.h>
12#include <linux/spinlock.h>
13#include <linux/slab.h>
14#include <linux/swapops.h>
15#include <linux/ksm.h>
16#include <linux/mman.h>
17
18#include <asm/pgtable.h>
19#include <asm/pgalloc.h>
20#include <asm/gmap.h>
21#include <asm/tlb.h>
22
23/**
24 * gmap_alloc - allocate a guest address space
25 * @mm: pointer to the parent mm_struct
26 * @limit: maximum address of the gmap address space
27 *
28 * Returns a guest address space structure.
29 */
30struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
31{
32 struct gmap *gmap;
33 struct page *page;
34 unsigned long *table;
35 unsigned long etype, atype;
36
37 if (limit < (1UL << 31)) {
38 limit = (1UL << 31) - 1;
39 atype = _ASCE_TYPE_SEGMENT;
40 etype = _SEGMENT_ENTRY_EMPTY;
41 } else if (limit < (1UL << 42)) {
42 limit = (1UL << 42) - 1;
43 atype = _ASCE_TYPE_REGION3;
44 etype = _REGION3_ENTRY_EMPTY;
45 } else if (limit < (1UL << 53)) {
46 limit = (1UL << 53) - 1;
47 atype = _ASCE_TYPE_REGION2;
48 etype = _REGION2_ENTRY_EMPTY;
49 } else {
50 limit = -1UL;
51 atype = _ASCE_TYPE_REGION1;
52 etype = _REGION1_ENTRY_EMPTY;
53 }
54 gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
55 if (!gmap)
56 goto out;
57 INIT_LIST_HEAD(&gmap->crst_list);
58 INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
59 INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
60 spin_lock_init(&gmap->guest_table_lock);
61 gmap->mm = mm;
62 page = alloc_pages(GFP_KERNEL, 2);
63 if (!page)
64 goto out_free;
65 page->index = 0;
66 list_add(&page->lru, &gmap->crst_list);
67 table = (unsigned long *) page_to_phys(page);
68 crst_table_init(table, etype);
69 gmap->table = table;
70 gmap->asce = atype | _ASCE_TABLE_LENGTH |
71 _ASCE_USER_BITS | __pa(table);
72 gmap->asce_end = limit;
73 down_write(&mm->mmap_sem);
74 list_add(&gmap->list, &mm->context.gmap_list);
75 up_write(&mm->mmap_sem);
76 return gmap;
77
78out_free:
79 kfree(gmap);
80out:
81 return NULL;
82}
83EXPORT_SYMBOL_GPL(gmap_alloc);
84
85static void gmap_flush_tlb(struct gmap *gmap)
86{
87 if (MACHINE_HAS_IDTE)
88 __tlb_flush_asce(gmap->mm, gmap->asce);
89 else
90 __tlb_flush_global();
91}
92
93static void gmap_radix_tree_free(struct radix_tree_root *root)
94{
95 struct radix_tree_iter iter;
96 unsigned long indices[16];
97 unsigned long index;
98 void **slot;
99 int i, nr;
100
101 /* A radix tree is freed by deleting all of its entries */
102 index = 0;
103 do {
104 nr = 0;
105 radix_tree_for_each_slot(slot, root, &iter, index) {
106 indices[nr] = iter.index;
107 if (++nr == 16)
108 break;
109 }
110 for (i = 0; i < nr; i++) {
111 index = indices[i];
112 radix_tree_delete(root, index);
113 }
114 } while (nr > 0);
115}
116
117/**
118 * gmap_free - free a guest address space
119 * @gmap: pointer to the guest address space structure
120 */
121void gmap_free(struct gmap *gmap)
122{
123 struct page *page, *next;
124
125 /* Flush tlb. */
126 if (MACHINE_HAS_IDTE)
127 __tlb_flush_asce(gmap->mm, gmap->asce);
128 else
129 __tlb_flush_global();
130
131 /* Free all segment & region tables. */
132 list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
133 __free_pages(page, 2);
134 gmap_radix_tree_free(&gmap->guest_to_host);
135 gmap_radix_tree_free(&gmap->host_to_guest);
136 down_write(&gmap->mm->mmap_sem);
137 list_del(&gmap->list);
138 up_write(&gmap->mm->mmap_sem);
139 kfree(gmap);
140}
141EXPORT_SYMBOL_GPL(gmap_free);
142
143/**
144 * gmap_enable - switch primary space to the guest address space
145 * @gmap: pointer to the guest address space structure
146 */
147void gmap_enable(struct gmap *gmap)
148{
149 S390_lowcore.gmap = (unsigned long) gmap;
150}
151EXPORT_SYMBOL_GPL(gmap_enable);
152
153/**
154 * gmap_disable - switch back to the standard primary address space
155 * @gmap: pointer to the guest address space structure
156 */
157void gmap_disable(struct gmap *gmap)
158{
159 S390_lowcore.gmap = 0UL;
160}
161EXPORT_SYMBOL_GPL(gmap_disable);
162
163/*
164 * gmap_alloc_table is assumed to be called with mmap_sem held
165 */
166static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
167 unsigned long init, unsigned long gaddr)
168{
169 struct page *page;
170 unsigned long *new;
171
172 /* since we don't free the gmap table until gmap_free we can unlock */
173 page = alloc_pages(GFP_KERNEL, 2);
174 if (!page)
175 return -ENOMEM;
176 new = (unsigned long *) page_to_phys(page);
177 crst_table_init(new, init);
178 spin_lock(&gmap->mm->page_table_lock);
179 if (*table & _REGION_ENTRY_INVALID) {
180 list_add(&page->lru, &gmap->crst_list);
181 *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
182 (*table & _REGION_ENTRY_TYPE_MASK);
183 page->index = gaddr;
184 page = NULL;
185 }
186 spin_unlock(&gmap->mm->page_table_lock);
187 if (page)
188 __free_pages(page, 2);
189 return 0;
190}
191
192/**
193 * __gmap_segment_gaddr - find virtual address from segment pointer
194 * @entry: pointer to a segment table entry in the guest address space
195 *
196 * Returns the virtual address in the guest address space for the segment
197 */
198static unsigned long __gmap_segment_gaddr(unsigned long *entry)
199{
200 struct page *page;
201 unsigned long offset, mask;
202
203 offset = (unsigned long) entry / sizeof(unsigned long);
204 offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
205 mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
206 page = virt_to_page((void *)((unsigned long) entry & mask));
207 return page->index + offset;
208}
209
210/**
211 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
212 * @gmap: pointer to the guest address space structure
213 * @vmaddr: address in the host process address space
214 *
215 * Returns 1 if a TLB flush is required
216 */
217static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
218{
219 unsigned long *entry;
220 int flush = 0;
221
222 spin_lock(&gmap->guest_table_lock);
223 entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
224 if (entry) {
225 flush = (*entry != _SEGMENT_ENTRY_INVALID);
226 *entry = _SEGMENT_ENTRY_INVALID;
227 }
228 spin_unlock(&gmap->guest_table_lock);
229 return flush;
230}
231
232/**
233 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
234 * @gmap: pointer to the guest address space structure
235 * @gaddr: address in the guest address space
236 *
237 * Returns 1 if a TLB flush is required
238 */
239static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
240{
241 unsigned long vmaddr;
242
243 vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
244 gaddr >> PMD_SHIFT);
245 return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
246}
247
248/**
249 * gmap_unmap_segment - unmap segment from the guest address space
250 * @gmap: pointer to the guest address space structure
251 * @to: address in the guest address space
252 * @len: length of the memory area to unmap
253 *
254 * Returns 0 if the unmap succeeded, -EINVAL if not.
255 */
256int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
257{
258 unsigned long off;
259 int flush;
260
261 if ((to | len) & (PMD_SIZE - 1))
262 return -EINVAL;
263 if (len == 0 || to + len < to)
264 return -EINVAL;
265
266 flush = 0;
267 down_write(&gmap->mm->mmap_sem);
268 for (off = 0; off < len; off += PMD_SIZE)
269 flush |= __gmap_unmap_by_gaddr(gmap, to + off);
270 up_write(&gmap->mm->mmap_sem);
271 if (flush)
272 gmap_flush_tlb(gmap);
273 return 0;
274}
275EXPORT_SYMBOL_GPL(gmap_unmap_segment);
276
277/**
278 * gmap_map_segment - map a segment to the guest address space
279 * @gmap: pointer to the guest address space structure
280 * @from: source address in the parent address space
281 * @to: target address in the guest address space
282 * @len: length of the memory area to map
283 *
284 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
285 */
286int gmap_map_segment(struct gmap *gmap, unsigned long from,
287 unsigned long to, unsigned long len)
288{
289 unsigned long off;
290 int flush;
291
292 if ((from | to | len) & (PMD_SIZE - 1))
293 return -EINVAL;
294 if (len == 0 || from + len < from || to + len < to ||
295 from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
296 return -EINVAL;
297
298 flush = 0;
299 down_write(&gmap->mm->mmap_sem);
300 for (off = 0; off < len; off += PMD_SIZE) {
301 /* Remove old translation */
302 flush |= __gmap_unmap_by_gaddr(gmap, to + off);
303 /* Store new translation */
304 if (radix_tree_insert(&gmap->guest_to_host,
305 (to + off) >> PMD_SHIFT,
306 (void *) from + off))
307 break;
308 }
309 up_write(&gmap->mm->mmap_sem);
310 if (flush)
311 gmap_flush_tlb(gmap);
312 if (off >= len)
313 return 0;
314 gmap_unmap_segment(gmap, to, len);
315 return -ENOMEM;
316}
317EXPORT_SYMBOL_GPL(gmap_map_segment);
318
319/**
320 * __gmap_translate - translate a guest address to a user space address
321 * @gmap: pointer to guest mapping meta data structure
322 * @gaddr: guest address
323 *
324 * Returns user space address which corresponds to the guest address or
325 * -EFAULT if no such mapping exists.
326 * This function does not establish potentially missing page table entries.
327 * The mmap_sem of the mm that belongs to the address space must be held
328 * when this function gets called.
329 */
330unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
331{
332 unsigned long vmaddr;
333
334 vmaddr = (unsigned long)
335 radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
336 return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
337}
338EXPORT_SYMBOL_GPL(__gmap_translate);
339
340/**
341 * gmap_translate - translate a guest address to a user space address
342 * @gmap: pointer to guest mapping meta data structure
343 * @gaddr: guest address
344 *
345 * Returns user space address which corresponds to the guest address or
346 * -EFAULT if no such mapping exists.
347 * This function does not establish potentially missing page table entries.
348 */
349unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
350{
351 unsigned long rc;
352
353 down_read(&gmap->mm->mmap_sem);
354 rc = __gmap_translate(gmap, gaddr);
355 up_read(&gmap->mm->mmap_sem);
356 return rc;
357}
358EXPORT_SYMBOL_GPL(gmap_translate);
359
360/**
361 * gmap_unlink - disconnect a page table from the gmap shadow tables
362 * @mm: pointer to the parent mm_struct
363 * @table: pointer to the host page table
364 * @vmaddr: vm address associated with the host page table
365 */
366void gmap_unlink(struct mm_struct *mm, unsigned long *table,
367 unsigned long vmaddr)
368{
369 struct gmap *gmap;
370 int flush;
371
372 list_for_each_entry(gmap, &mm->context.gmap_list, list) {
373 flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
374 if (flush)
375 gmap_flush_tlb(gmap);
376 }
377}
378
379/**
380 * __gmap_link - set up shadow page tables to connect a host to a guest address
381 * @gmap: pointer to guest mapping meta data structure
382 * @gaddr: guest address
383 * @vmaddr: vm address
384 *
385 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
386 * if the vm address is already mapped to a different guest segment.
387 * The mmap_sem of the mm that belongs to the address space must be held
388 * when this function gets called.
389 */
390int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
391{
392 struct mm_struct *mm;
393 unsigned long *table;
394 spinlock_t *ptl;
395 pgd_t *pgd;
396 pud_t *pud;
397 pmd_t *pmd;
398 int rc;
399
400 /* Create higher level tables in the gmap page table */
401 table = gmap->table;
402 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
403 table += (gaddr >> 53) & 0x7ff;
404 if ((*table & _REGION_ENTRY_INVALID) &&
405 gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
406 gaddr & 0xffe0000000000000UL))
407 return -ENOMEM;
408 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
409 }
410 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
411 table += (gaddr >> 42) & 0x7ff;
412 if ((*table & _REGION_ENTRY_INVALID) &&
413 gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
414 gaddr & 0xfffffc0000000000UL))
415 return -ENOMEM;
416 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
417 }
418 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
419 table += (gaddr >> 31) & 0x7ff;
420 if ((*table & _REGION_ENTRY_INVALID) &&
421 gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
422 gaddr & 0xffffffff80000000UL))
423 return -ENOMEM;
424 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
425 }
426 table += (gaddr >> 20) & 0x7ff;
427 /* Walk the parent mm page table */
428 mm = gmap->mm;
429 pgd = pgd_offset(mm, vmaddr);
430 VM_BUG_ON(pgd_none(*pgd));
431 pud = pud_offset(pgd, vmaddr);
432 VM_BUG_ON(pud_none(*pud));
433 pmd = pmd_offset(pud, vmaddr);
434 VM_BUG_ON(pmd_none(*pmd));
435 /* large pmds cannot yet be handled */
436 if (pmd_large(*pmd))
437 return -EFAULT;
438 /* Link gmap segment table entry location to page table. */
439 rc = radix_tree_preload(GFP_KERNEL);
440 if (rc)
441 return rc;
442 ptl = pmd_lock(mm, pmd);
443 spin_lock(&gmap->guest_table_lock);
444 if (*table == _SEGMENT_ENTRY_INVALID) {
445 rc = radix_tree_insert(&gmap->host_to_guest,
446 vmaddr >> PMD_SHIFT, table);
447 if (!rc)
448 *table = pmd_val(*pmd);
449 } else
450 rc = 0;
451 spin_unlock(&gmap->guest_table_lock);
452 spin_unlock(ptl);
453 radix_tree_preload_end();
454 return rc;
455}
456
457/**
458 * gmap_fault - resolve a fault on a guest address
459 * @gmap: pointer to guest mapping meta data structure
460 * @gaddr: guest address
461 * @fault_flags: flags to pass down to handle_mm_fault()
462 *
463 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
464 * if the vm address is already mapped to a different guest segment.
465 */
466int gmap_fault(struct gmap *gmap, unsigned long gaddr,
467 unsigned int fault_flags)
468{
469 unsigned long vmaddr;
470 int rc;
471 bool unlocked;
472
473 down_read(&gmap->mm->mmap_sem);
474
475retry:
476 unlocked = false;
477 vmaddr = __gmap_translate(gmap, gaddr);
478 if (IS_ERR_VALUE(vmaddr)) {
479 rc = vmaddr;
480 goto out_up;
481 }
482 if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
483 &unlocked)) {
484 rc = -EFAULT;
485 goto out_up;
486 }
487 /*
488 * In the case that fixup_user_fault unlocked the mmap_sem during
489 * fault-in, redo __gmap_translate to not race with a map/unmap_segment.
490 */
491 if (unlocked)
492 goto retry;
493
494 rc = __gmap_link(gmap, gaddr, vmaddr);
495out_up:
496 up_read(&gmap->mm->mmap_sem);
497 return rc;
498}
499EXPORT_SYMBOL_GPL(gmap_fault);
500
501/*
502 * this function is assumed to be called with mmap_sem held
503 */
504void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
505{
506 unsigned long vmaddr;
507 spinlock_t *ptl;
508 pte_t *ptep;
509
510 /* Find the vm address for the guest address */
511 vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
512 gaddr >> PMD_SHIFT);
513 if (vmaddr) {
514 vmaddr |= gaddr & ~PMD_MASK;
515 /* Get pointer to the page table entry */
516 ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
517 if (likely(ptep))
518 ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
519 pte_unmap_unlock(ptep, ptl);
520 }
521}
522EXPORT_SYMBOL_GPL(__gmap_zap);
523
524void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
525{
526 unsigned long gaddr, vmaddr, size;
527 struct vm_area_struct *vma;
528
529 down_read(&gmap->mm->mmap_sem);
530 for (gaddr = from; gaddr < to;
531 gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
532 /* Find the vm address for the guest address */
533 vmaddr = (unsigned long)
534 radix_tree_lookup(&gmap->guest_to_host,
535 gaddr >> PMD_SHIFT);
536 if (!vmaddr)
537 continue;
538 vmaddr |= gaddr & ~PMD_MASK;
539 /* Find vma in the parent mm */
540 vma = find_vma(gmap->mm, vmaddr);
541 size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
542 zap_page_range(vma, vmaddr, size, NULL);
543 }
544 up_read(&gmap->mm->mmap_sem);
545}
546EXPORT_SYMBOL_GPL(gmap_discard);
547
548static LIST_HEAD(gmap_notifier_list);
549static DEFINE_SPINLOCK(gmap_notifier_lock);
550
551/**
552 * gmap_register_ipte_notifier - register a pte invalidation callback
553 * @nb: pointer to the gmap notifier block
554 */
555void gmap_register_ipte_notifier(struct gmap_notifier *nb)
556{
557 spin_lock(&gmap_notifier_lock);
558 list_add(&nb->list, &gmap_notifier_list);
559 spin_unlock(&gmap_notifier_lock);
560}
561EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
562
563/**
564 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
565 * @nb: pointer to the gmap notifier block
566 */
567void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
568{
569 spin_lock(&gmap_notifier_lock);
570 list_del_init(&nb->list);
571 spin_unlock(&gmap_notifier_lock);
572}
573EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
574
575/**
576 * gmap_ipte_notify - mark a range of ptes for invalidation notification
577 * @gmap: pointer to guest mapping meta data structure
578 * @gaddr: virtual address in the guest address space
579 * @len: size of area
580 *
581 * Returns 0 if for each page in the given range a gmap mapping exists and
582 * the invalidation notification could be set. If the gmap mapping is missing
583 * for one or more pages -EFAULT is returned. If no memory could be allocated
584 * -ENOMEM is returned. This function establishes missing page table entries.
585 */
586int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
587{
588 unsigned long addr;
589 spinlock_t *ptl;
590 pte_t *ptep;
591 bool unlocked;
592 int rc = 0;
593
594 if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
595 return -EINVAL;
596 down_read(&gmap->mm->mmap_sem);
597 while (len) {
598 unlocked = false;
599 /* Convert gmap address and connect the page tables */
600 addr = __gmap_translate(gmap, gaddr);
601 if (IS_ERR_VALUE(addr)) {
602 rc = addr;
603 break;
604 }
605 /* Get the page mapped */
606 if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
607 &unlocked)) {
608 rc = -EFAULT;
609 break;
610 }
611 /* While trying to map mmap_sem got unlocked. Let us retry */
612 if (unlocked)
613 continue;
614 rc = __gmap_link(gmap, gaddr, addr);
615 if (rc)
616 break;
617 /* Walk the process page table, lock and get pte pointer */
618 ptep = get_locked_pte(gmap->mm, addr, &ptl);
619 VM_BUG_ON(!ptep);
620 /* Set notification bit in the pgste of the pte */
621 if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
622 ptep_set_notify(gmap->mm, addr, ptep);
623 gaddr += PAGE_SIZE;
624 len -= PAGE_SIZE;
625 }
626 pte_unmap_unlock(ptep, ptl);
627 }
628 up_read(&gmap->mm->mmap_sem);
629 return rc;
630}
631EXPORT_SYMBOL_GPL(gmap_ipte_notify);
632
633/**
634 * ptep_notify - call all invalidation callbacks for a specific pte.
635 * @mm: pointer to the process mm_struct
636 * @addr: virtual address in the process address space
637 * @pte: pointer to the page table entry
638 *
639 * This function is assumed to be called with the page table lock held
640 * for the pte to notify.
641 */
642void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
643{
644 unsigned long offset, gaddr;
645 unsigned long *table;
646 struct gmap_notifier *nb;
647 struct gmap *gmap;
648
649 offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
650 offset = offset * (4096 / sizeof(pte_t));
651 spin_lock(&gmap_notifier_lock);
652 list_for_each_entry(gmap, &mm->context.gmap_list, list) {
653 table = radix_tree_lookup(&gmap->host_to_guest,
654 vmaddr >> PMD_SHIFT);
655 if (!table)
656 continue;
657 gaddr = __gmap_segment_gaddr(table) + offset;
658 list_for_each_entry(nb, &gmap_notifier_list, list)
659 nb->notifier_call(gmap, gaddr);
660 }
661 spin_unlock(&gmap_notifier_lock);
662}
663EXPORT_SYMBOL_GPL(ptep_notify);
664
665static inline void thp_split_mm(struct mm_struct *mm)
666{
667#ifdef CONFIG_TRANSPARENT_HUGEPAGE
668 struct vm_area_struct *vma;
669 unsigned long addr;
670
671 for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
672 for (addr = vma->vm_start;
673 addr < vma->vm_end;
674 addr += PAGE_SIZE)
675 follow_page(vma, addr, FOLL_SPLIT);
676 vma->vm_flags &= ~VM_HUGEPAGE;
677 vma->vm_flags |= VM_NOHUGEPAGE;
678 }
679 mm->def_flags |= VM_NOHUGEPAGE;
680#endif
681}
682
683/*
684 * switch on pgstes for its userspace process (for kvm)
685 */
686int s390_enable_sie(void)
687{
688 struct mm_struct *mm = current->mm;
689
690 /* Do we have pgstes? if yes, we are done */
691 if (mm_has_pgste(mm))
692 return 0;
693 /* Fail if the page tables are 2K */
694 if (!mm_alloc_pgste(mm))
695 return -EINVAL;
696 down_write(&mm->mmap_sem);
697 mm->context.has_pgste = 1;
698 /* split thp mappings and disable thp for future mappings */
699 thp_split_mm(mm);
700 up_write(&mm->mmap_sem);
701 return 0;
702}
703EXPORT_SYMBOL_GPL(s390_enable_sie);
704
705/*
706 * Enable storage key handling from now on and initialize the storage
707 * keys with the default key.
708 */
709static int __s390_enable_skey(pte_t *pte, unsigned long addr,
710 unsigned long next, struct mm_walk *walk)
711{
712 /*
713 * Remove all zero page mappings,
714 * after establishing a policy to forbid zero page mappings
715 * following faults for that page will get fresh anonymous pages
716 */
717 if (is_zero_pfn(pte_pfn(*pte)))
718 ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID));
719 /* Clear storage key */
720 ptep_zap_key(walk->mm, addr, pte);
721 return 0;
722}
723
724int s390_enable_skey(void)
725{
726 struct mm_walk walk = { .pte_entry = __s390_enable_skey };
727 struct mm_struct *mm = current->mm;
728 struct vm_area_struct *vma;
729 int rc = 0;
730
731 down_write(&mm->mmap_sem);
732 if (mm_use_skey(mm))
733 goto out_up;
734
735 mm->context.use_skey = 1;
736 for (vma = mm->mmap; vma; vma = vma->vm_next) {
737 if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
738 MADV_UNMERGEABLE, &vma->vm_flags)) {
739 mm->context.use_skey = 0;
740 rc = -ENOMEM;
741 goto out_up;
742 }
743 }
744 mm->def_flags &= ~VM_MERGEABLE;
745
746 walk.mm = mm;
747 walk_page_range(0, TASK_SIZE, &walk);
748
749out_up:
750 up_write(&mm->mmap_sem);
751 return rc;
752}
753EXPORT_SYMBOL_GPL(s390_enable_skey);
754
755/*
756 * Reset CMMA state, make all pages stable again.
757 */
758static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
759 unsigned long next, struct mm_walk *walk)
760{
761 ptep_zap_unused(walk->mm, addr, pte, 1);
762 return 0;
763}
764
765void s390_reset_cmma(struct mm_struct *mm)
766{
767 struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
768
769 down_write(&mm->mmap_sem);
770 walk.mm = mm;
771 walk_page_range(0, TASK_SIZE, &walk);
772 up_write(&mm->mmap_sem);
773}
774EXPORT_SYMBOL_GPL(s390_reset_cmma);
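As a reading aid for the notifier machinery in the new gmap.c above, here is a hypothetical consumer, not part of this patch; the demo_* names are invented. It registers a callback, asks for invalidation notification on a page-aligned guest range via gmap_ipte_notify(), and is then called back from ptep_notify() whenever one of the marked ptes is invalidated.

#include <asm/gmap.h>

static void demo_pte_invalidated(struct gmap *gmap, unsigned long gaddr)
{
        /* invoked from ptep_notify() with gmap_notifier_lock held */
}

static struct gmap_notifier demo_notifier = {
        .notifier_call = demo_pte_invalidated,
};

static int demo_watch_range(struct gmap *gmap, unsigned long gaddr,
                            unsigned long len)
{
        int rc;

        gmap_register_ipte_notifier(&demo_notifier);
        /* gaddr and len must be page aligned, see gmap_ipte_notify() */
        rc = gmap_ipte_notify(gmap, gaddr, len);
        if (rc)
                gmap_unregister_ipte_notifier(&demo_notifier);
        return rc;
}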
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
new file mode 100644
index 000000000000..f6c3de26cda8
--- /dev/null
+++ b/arch/s390/mm/pgalloc.c
@@ -0,0 +1,360 @@
1/*
2 * Page table allocation functions
3 *
4 * Copyright IBM Corp. 2016
5 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
6 */
7
8#include <linux/mm.h>
9#include <linux/sysctl.h>
10#include <asm/mmu_context.h>
11#include <asm/pgalloc.h>
12#include <asm/gmap.h>
13#include <asm/tlb.h>
14#include <asm/tlbflush.h>
15
16#ifdef CONFIG_PGSTE
17
18static int page_table_allocate_pgste_min = 0;
19static int page_table_allocate_pgste_max = 1;
20int page_table_allocate_pgste = 0;
21EXPORT_SYMBOL(page_table_allocate_pgste);
22
23static struct ctl_table page_table_sysctl[] = {
24 {
25 .procname = "allocate_pgste",
26 .data = &page_table_allocate_pgste,
27 .maxlen = sizeof(int),
28 .mode = S_IRUGO | S_IWUSR,
29 .proc_handler = proc_dointvec,
30 .extra1 = &page_table_allocate_pgste_min,
31 .extra2 = &page_table_allocate_pgste_max,
32 },
33 { }
34};
35
36static struct ctl_table page_table_sysctl_dir[] = {
37 {
38 .procname = "vm",
39 .maxlen = 0,
40 .mode = 0555,
41 .child = page_table_sysctl,
42 },
43 { }
44};
45
46static int __init page_table_register_sysctl(void)
47{
48 return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
49}
50__initcall(page_table_register_sysctl);
51
52#endif /* CONFIG_PGSTE */
53
54unsigned long *crst_table_alloc(struct mm_struct *mm)
55{
56 struct page *page = alloc_pages(GFP_KERNEL, 2);
57
58 if (!page)
59 return NULL;
60 return (unsigned long *) page_to_phys(page);
61}
62
63void crst_table_free(struct mm_struct *mm, unsigned long *table)
64{
65 free_pages((unsigned long) table, 2);
66}
67
68static void __crst_table_upgrade(void *arg)
69{
70 struct mm_struct *mm = arg;
71
72 if (current->active_mm == mm) {
73 clear_user_asce();
74 set_user_asce(mm);
75 }
76 __tlb_flush_local();
77}
78
79int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
80{
81 unsigned long *table, *pgd;
82 unsigned long entry;
83 int flush;
84
85 BUG_ON(limit > TASK_MAX_SIZE);
86 flush = 0;
87repeat:
88 table = crst_table_alloc(mm);
89 if (!table)
90 return -ENOMEM;
91 spin_lock_bh(&mm->page_table_lock);
92 if (mm->context.asce_limit < limit) {
93 pgd = (unsigned long *) mm->pgd;
94 if (mm->context.asce_limit <= (1UL << 31)) {
95 entry = _REGION3_ENTRY_EMPTY;
96 mm->context.asce_limit = 1UL << 42;
97 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
98 _ASCE_USER_BITS |
99 _ASCE_TYPE_REGION3;
100 } else {
101 entry = _REGION2_ENTRY_EMPTY;
102 mm->context.asce_limit = 1UL << 53;
103 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
104 _ASCE_USER_BITS |
105 _ASCE_TYPE_REGION2;
106 }
107 crst_table_init(table, entry);
108 pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
109 mm->pgd = (pgd_t *) table;
110 mm->task_size = mm->context.asce_limit;
111 table = NULL;
112 flush = 1;
113 }
114 spin_unlock_bh(&mm->page_table_lock);
115 if (table)
116 crst_table_free(mm, table);
117 if (mm->context.asce_limit < limit)
118 goto repeat;
119 if (flush)
120 on_each_cpu(__crst_table_upgrade, mm, 0);
121 return 0;
122}
123
124void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
125{
126 pgd_t *pgd;
127
128 if (current->active_mm == mm) {
129 clear_user_asce();
130 __tlb_flush_mm(mm);
131 }
132 while (mm->context.asce_limit > limit) {
133 pgd = mm->pgd;
134 switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
135 case _REGION_ENTRY_TYPE_R2:
136 mm->context.asce_limit = 1UL << 42;
137 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
138 _ASCE_USER_BITS |
139 _ASCE_TYPE_REGION3;
140 break;
141 case _REGION_ENTRY_TYPE_R3:
142 mm->context.asce_limit = 1UL << 31;
143 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
144 _ASCE_USER_BITS |
145 _ASCE_TYPE_SEGMENT;
146 break;
147 default:
148 BUG();
149 }
150 mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
151 mm->task_size = mm->context.asce_limit;
152 crst_table_free(mm, (unsigned long *) pgd);
153 }
154 if (current->active_mm == mm)
155 set_user_asce(mm);
156}
157
158static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
159{
160 unsigned int old, new;
161
162 do {
163 old = atomic_read(v);
164 new = old ^ bits;
165 } while (atomic_cmpxchg(v, old, new) != old);
166 return new;
167}
168
169/*
170 * page table entry allocation/free routines.
171 */
172unsigned long *page_table_alloc(struct mm_struct *mm)
173{
174 unsigned long *table;
175 struct page *page;
176 unsigned int mask, bit;
177
178 /* Try to get a fragment of a 4K page as a 2K page table */
179 if (!mm_alloc_pgste(mm)) {
180 table = NULL;
181 spin_lock_bh(&mm->context.list_lock);
182 if (!list_empty(&mm->context.pgtable_list)) {
183 page = list_first_entry(&mm->context.pgtable_list,
184 struct page, lru);
185 mask = atomic_read(&page->_mapcount);
186 mask = (mask | (mask >> 4)) & 3;
187 if (mask != 3) {
188 table = (unsigned long *) page_to_phys(page);
189 bit = mask & 1; /* =1 -> second 2K */
190 if (bit)
191 table += PTRS_PER_PTE;
192 atomic_xor_bits(&page->_mapcount, 1U << bit);
193 list_del(&page->lru);
194 }
195 }
196 spin_unlock_bh(&mm->context.list_lock);
197 if (table)
198 return table;
199 }
200 /* Allocate a fresh page */
201 page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
202 if (!page)
203 return NULL;
204 if (!pgtable_page_ctor(page)) {
205 __free_page(page);
206 return NULL;
207 }
208 /* Initialize page table */
209 table = (unsigned long *) page_to_phys(page);
210 if (mm_alloc_pgste(mm)) {
211 /* Return 4K page table with PGSTEs */
212 atomic_set(&page->_mapcount, 3);
213 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
214 clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
215 } else {
216 /* Return the first 2K fragment of the page */
217 atomic_set(&page->_mapcount, 1);
218 clear_table(table, _PAGE_INVALID, PAGE_SIZE);
219 spin_lock_bh(&mm->context.list_lock);
220 list_add(&page->lru, &mm->context.pgtable_list);
221 spin_unlock_bh(&mm->context.list_lock);
222 }
223 return table;
224}
225
226void page_table_free(struct mm_struct *mm, unsigned long *table)
227{
228 struct page *page;
229 unsigned int bit, mask;
230
231 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
232 if (!mm_alloc_pgste(mm)) {
233 /* Free 2K page table fragment of a 4K page */
234 bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
235 spin_lock_bh(&mm->context.list_lock);
236 mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
237 if (mask & 3)
238 list_add(&page->lru, &mm->context.pgtable_list);
239 else
240 list_del(&page->lru);
241 spin_unlock_bh(&mm->context.list_lock);
242 if (mask != 0)
243 return;
244 }
245
246 pgtable_page_dtor(page);
247 atomic_set(&page->_mapcount, -1);
248 __free_page(page);
249}
250
251void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
252 unsigned long vmaddr)
253{
254 struct mm_struct *mm;
255 struct page *page;
256 unsigned int bit, mask;
257
258 mm = tlb->mm;
259 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
260 if (mm_alloc_pgste(mm)) {
261 gmap_unlink(mm, table, vmaddr);
262 table = (unsigned long *) (__pa(table) | 3);
263 tlb_remove_table(tlb, table);
264 return;
265 }
266 bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
267 spin_lock_bh(&mm->context.list_lock);
268 mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
269 if (mask & 3)
270 list_add_tail(&page->lru, &mm->context.pgtable_list);
271 else
272 list_del(&page->lru);
273 spin_unlock_bh(&mm->context.list_lock);
274 table = (unsigned long *) (__pa(table) | (1U << bit));
275 tlb_remove_table(tlb, table);
276}
277
278static void __tlb_remove_table(void *_table)
279{
280 unsigned int mask = (unsigned long) _table & 3;
281 void *table = (void *)((unsigned long) _table ^ mask);
282 struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
283
284 switch (mask) {
285 case 0: /* pmd or pud */
286 free_pages((unsigned long) table, 2);
287 break;
288 case 1: /* lower 2K of a 4K page table */
289 case 2: /* higher 2K of a 4K page table */
290 if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
291 break;
292 /* fallthrough */
293 case 3: /* 4K page table with pgstes */
294 pgtable_page_dtor(page);
295 atomic_set(&page->_mapcount, -1);
296 __free_page(page);
297 break;
298 }
299}
300
301static void tlb_remove_table_smp_sync(void *arg)
302{
303 /* Simply deliver the interrupt */
304}
305
306static void tlb_remove_table_one(void *table)
307{
308 /*
309 * This isn't an RCU grace period and hence the page-tables cannot be
310 * assumed to be actually RCU-freed.
311 *
312 * It is however sufficient for software page-table walkers that rely
313 * on IRQ disabling. See the comment near struct mmu_table_batch.
314 */
315 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
316 __tlb_remove_table(table);
317}
318
319static void tlb_remove_table_rcu(struct rcu_head *head)
320{
321 struct mmu_table_batch *batch;
322 int i;
323
324 batch = container_of(head, struct mmu_table_batch, rcu);
325
326 for (i = 0; i < batch->nr; i++)
327 __tlb_remove_table(batch->tables[i]);
328
329 free_page((unsigned long)batch);
330}
331
332void tlb_table_flush(struct mmu_gather *tlb)
333{
334 struct mmu_table_batch **batch = &tlb->batch;
335
336 if (*batch) {
337 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
338 *batch = NULL;
339 }
340}
341
342void tlb_remove_table(struct mmu_gather *tlb, void *table)
343{
344 struct mmu_table_batch **batch = &tlb->batch;
345
346 tlb->mm->context.flush_mm = 1;
347 if (*batch == NULL) {
348 *batch = (struct mmu_table_batch *)
349 __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
350 if (*batch == NULL) {
351 __tlb_flush_mm_lazy(tlb->mm);
352 tlb_remove_table_one(table);
353 return;
354 }
355 (*batch)->nr = 0;
356 }
357 (*batch)->tables[(*batch)->nr++] = table;
358 if ((*batch)->nr == MAX_TABLE_BATCH)
359 tlb_flush_mmu(tlb);
360}
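A small userspace reading aid, not kernel code, for the _mapcount bookkeeping used by page_table_alloc() and page_table_free_rcu() in the new pgalloc.c above: bits 0-1 record which 2 KB half of the 4 KB page is in use, bits 4-5 mark a half that is queued for RCU removal, and the (mask | mask >> 4) & 3 fold treats a pending half as unavailable. The sample values are made up.

#include <stdio.h>

int main(void)
{
        unsigned int mapcount = 0x01;   /* lower 2K half in use, upper half free */
        unsigned int mask;

        /* page_table_alloc(): fold pending-free bits into the busy bits */
        mask = (mapcount | (mapcount >> 4)) & 3;
        printf("usable? mask=%u (3 means no 2K half available)\n", mask);

        /* page_table_free_rcu() on the lower half: clear bit 0, set bit 4 */
        mapcount ^= 0x11;
        printf("after free_rcu: _mapcount=0x%02x\n", mapcount);

        /* __tlb_remove_table() later clears the pending bit again */
        mapcount ^= 0x01 << 4;
        printf("after __tlb_remove_table: _mapcount=0x%02x (page can be freed)\n",
               mapcount);
        return 0;
}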
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index e24126208614..4324b87f9398 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -24,1140 +24,6 @@
24#include <asm/tlbflush.h>
25#include <asm/mmu_context.h>
26
27unsigned long *crst_table_alloc(struct mm_struct *mm)
28{
29 struct page *page = alloc_pages(GFP_KERNEL, 2);
30
31 if (!page)
32 return NULL;
33 return (unsigned long *) page_to_phys(page);
34}
35
36void crst_table_free(struct mm_struct *mm, unsigned long *table)
37{
38 free_pages((unsigned long) table, 2);
39}
40
41static void __crst_table_upgrade(void *arg)
42{
43 struct mm_struct *mm = arg;
44
45 if (current->active_mm == mm) {
46 clear_user_asce();
47 set_user_asce(mm);
48 }
49 __tlb_flush_local();
50}
51
52int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
53{
54 unsigned long *table, *pgd;
55 unsigned long entry;
56 int flush;
57
58 BUG_ON(limit > TASK_MAX_SIZE);
59 flush = 0;
60repeat:
61 table = crst_table_alloc(mm);
62 if (!table)
63 return -ENOMEM;
64 spin_lock_bh(&mm->page_table_lock);
65 if (mm->context.asce_limit < limit) {
66 pgd = (unsigned long *) mm->pgd;
67 if (mm->context.asce_limit <= (1UL << 31)) {
68 entry = _REGION3_ENTRY_EMPTY;
69 mm->context.asce_limit = 1UL << 42;
70 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
71 _ASCE_USER_BITS |
72 _ASCE_TYPE_REGION3;
73 } else {
74 entry = _REGION2_ENTRY_EMPTY;
75 mm->context.asce_limit = 1UL << 53;
76 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
77 _ASCE_USER_BITS |
78 _ASCE_TYPE_REGION2;
79 }
80 crst_table_init(table, entry);
81 pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
82 mm->pgd = (pgd_t *) table;
83 mm->task_size = mm->context.asce_limit;
84 table = NULL;
85 flush = 1;
86 }
87 spin_unlock_bh(&mm->page_table_lock);
88 if (table)
89 crst_table_free(mm, table);
90 if (mm->context.asce_limit < limit)
91 goto repeat;
92 if (flush)
93 on_each_cpu(__crst_table_upgrade, mm, 0);
94 return 0;
95}
96
97void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
98{
99 pgd_t *pgd;
100
101 if (current->active_mm == mm) {
102 clear_user_asce();
103 __tlb_flush_mm(mm);
104 }
105 while (mm->context.asce_limit > limit) {
106 pgd = mm->pgd;
107 switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
108 case _REGION_ENTRY_TYPE_R2:
109 mm->context.asce_limit = 1UL << 42;
110 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
111 _ASCE_USER_BITS |
112 _ASCE_TYPE_REGION3;
113 break;
114 case _REGION_ENTRY_TYPE_R3:
115 mm->context.asce_limit = 1UL << 31;
116 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
117 _ASCE_USER_BITS |
118 _ASCE_TYPE_SEGMENT;
119 break;
120 default:
121 BUG();
122 }
123 mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
124 mm->task_size = mm->context.asce_limit;
125 crst_table_free(mm, (unsigned long *) pgd);
126 }
127 if (current->active_mm == mm)
128 set_user_asce(mm);
129}
130
131#ifdef CONFIG_PGSTE
132
133/**
134 * gmap_alloc - allocate a guest address space
135 * @mm: pointer to the parent mm_struct
136 * @limit: maximum address of the gmap address space
137 *
138 * Returns a guest address space structure.
139 */
140struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
141{
142 struct gmap *gmap;
143 struct page *page;
144 unsigned long *table;
145 unsigned long etype, atype;
146
147 if (limit < (1UL << 31)) {
148 limit = (1UL << 31) - 1;
149 atype = _ASCE_TYPE_SEGMENT;
150 etype = _SEGMENT_ENTRY_EMPTY;
151 } else if (limit < (1UL << 42)) {
152 limit = (1UL << 42) - 1;
153 atype = _ASCE_TYPE_REGION3;
154 etype = _REGION3_ENTRY_EMPTY;
155 } else if (limit < (1UL << 53)) {
156 limit = (1UL << 53) - 1;
157 atype = _ASCE_TYPE_REGION2;
158 etype = _REGION2_ENTRY_EMPTY;
159 } else {
160 limit = -1UL;
161 atype = _ASCE_TYPE_REGION1;
162 etype = _REGION1_ENTRY_EMPTY;
163 }
164 gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
165 if (!gmap)
166 goto out;
167 INIT_LIST_HEAD(&gmap->crst_list);
168 INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
169 INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
170 spin_lock_init(&gmap->guest_table_lock);
171 gmap->mm = mm;
172 page = alloc_pages(GFP_KERNEL, 2);
173 if (!page)
174 goto out_free;
175 page->index = 0;
176 list_add(&page->lru, &gmap->crst_list);
177 table = (unsigned long *) page_to_phys(page);
178 crst_table_init(table, etype);
179 gmap->table = table;
180 gmap->asce = atype | _ASCE_TABLE_LENGTH |
181 _ASCE_USER_BITS | __pa(table);
182 gmap->asce_end = limit;
183 down_write(&mm->mmap_sem);
184 list_add(&gmap->list, &mm->context.gmap_list);
185 up_write(&mm->mmap_sem);
186 return gmap;
187
188out_free:
189 kfree(gmap);
190out:
191 return NULL;
192}
193EXPORT_SYMBOL_GPL(gmap_alloc);
194
195static void gmap_flush_tlb(struct gmap *gmap)
196{
197 if (MACHINE_HAS_IDTE)
198 __tlb_flush_asce(gmap->mm, gmap->asce);
199 else
200 __tlb_flush_global();
201}
202
203static void gmap_radix_tree_free(struct radix_tree_root *root)
204{
205 struct radix_tree_iter iter;
206 unsigned long indices[16];
207 unsigned long index;
208 void **slot;
209 int i, nr;
210
211 /* A radix tree is freed by deleting all of its entries */
212 index = 0;
213 do {
214 nr = 0;
215 radix_tree_for_each_slot(slot, root, &iter, index) {
216 indices[nr] = iter.index;
217 if (++nr == 16)
218 break;
219 }
220 for (i = 0; i < nr; i++) {
221 index = indices[i];
222 radix_tree_delete(root, index);
223 }
224 } while (nr > 0);
225}
226
227/**
228 * gmap_free - free a guest address space
229 * @gmap: pointer to the guest address space structure
230 */
231void gmap_free(struct gmap *gmap)
232{
233 struct page *page, *next;
234
235 /* Flush tlb. */
236 if (MACHINE_HAS_IDTE)
237 __tlb_flush_asce(gmap->mm, gmap->asce);
238 else
239 __tlb_flush_global();
240
241 /* Free all segment & region tables. */
242 list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
243 __free_pages(page, 2);
244 gmap_radix_tree_free(&gmap->guest_to_host);
245 gmap_radix_tree_free(&gmap->host_to_guest);
246 down_write(&gmap->mm->mmap_sem);
247 list_del(&gmap->list);
248 up_write(&gmap->mm->mmap_sem);
249 kfree(gmap);
250}
251EXPORT_SYMBOL_GPL(gmap_free);
252
253/**
254 * gmap_enable - switch primary space to the guest address space
255 * @gmap: pointer to the guest address space structure
256 */
257void gmap_enable(struct gmap *gmap)
258{
259 S390_lowcore.gmap = (unsigned long) gmap;
260}
261EXPORT_SYMBOL_GPL(gmap_enable);
262
263/**
264 * gmap_disable - switch back to the standard primary address space
265 * @gmap: pointer to the guest address space structure
266 */
267void gmap_disable(struct gmap *gmap)
268{
269 S390_lowcore.gmap = 0UL;
270}
271EXPORT_SYMBOL_GPL(gmap_disable);
272
273/*
274 * gmap_alloc_table is assumed to be called with mmap_sem held
275 */
276static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
277 unsigned long init, unsigned long gaddr)
278{
279 struct page *page;
280 unsigned long *new;
281
282 /* since we dont free the gmap table until gmap_free we can unlock */
283 page = alloc_pages(GFP_KERNEL, 2);
284 if (!page)
285 return -ENOMEM;
286 new = (unsigned long *) page_to_phys(page);
287 crst_table_init(new, init);
288 spin_lock(&gmap->mm->page_table_lock);
289 if (*table & _REGION_ENTRY_INVALID) {
290 list_add(&page->lru, &gmap->crst_list);
291 *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
292 (*table & _REGION_ENTRY_TYPE_MASK);
293 page->index = gaddr;
294 page = NULL;
295 }
296 spin_unlock(&gmap->mm->page_table_lock);
297 if (page)
298 __free_pages(page, 2);
299 return 0;
300}
301
302/**
303 * __gmap_segment_gaddr - find virtual address from segment pointer
304 * @entry: pointer to a segment table entry in the guest address space
305 *
306 * Returns the virtual address in the guest address space for the segment
307 */
308static unsigned long __gmap_segment_gaddr(unsigned long *entry)
309{
310 struct page *page;
311 unsigned long offset, mask;
312
313 offset = (unsigned long) entry / sizeof(unsigned long);
314 offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
315 mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
316 page = virt_to_page((void *)((unsigned long) entry & mask));
317 return page->index + offset;
318}
319
320/**
321 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
322 * @gmap: pointer to the guest address space structure
323 * @vmaddr: address in the host process address space
324 *
325 * Returns 1 if a TLB flush is required
326 */
327static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
328{
329 unsigned long *entry;
330 int flush = 0;
331
332 spin_lock(&gmap->guest_table_lock);
333 entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
334 if (entry) {
335 flush = (*entry != _SEGMENT_ENTRY_INVALID);
336 *entry = _SEGMENT_ENTRY_INVALID;
337 }
338 spin_unlock(&gmap->guest_table_lock);
339 return flush;
340}
341
342/**
343 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
344 * @gmap: pointer to the guest address space structure
345 * @gaddr: address in the guest address space
346 *
347 * Returns 1 if a TLB flush is required
348 */
349static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
350{
351 unsigned long vmaddr;
352
353 vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
354 gaddr >> PMD_SHIFT);
355 return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
356}
357
358/**
359 * gmap_unmap_segment - unmap segment from the guest address space
360 * @gmap: pointer to the guest address space structure
361 * @to: address in the guest address space
362 * @len: length of the memory area to unmap
363 *
364 * Returns 0 if the unmap succeeded, -EINVAL if not.
365 */
366int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
367{
368 unsigned long off;
369 int flush;
370
371 if ((to | len) & (PMD_SIZE - 1))
372 return -EINVAL;
373 if (len == 0 || to + len < to)
374 return -EINVAL;
375
376 flush = 0;
377 down_write(&gmap->mm->mmap_sem);
378 for (off = 0; off < len; off += PMD_SIZE)
379 flush |= __gmap_unmap_by_gaddr(gmap, to + off);
380 up_write(&gmap->mm->mmap_sem);
381 if (flush)
382 gmap_flush_tlb(gmap);
383 return 0;
384}
385EXPORT_SYMBOL_GPL(gmap_unmap_segment);
386
387/**
388 * gmap_mmap_segment - map a segment to the guest address space
389 * @gmap: pointer to the guest address space structure
390 * @from: source address in the parent address space
391 * @to: target address in the guest address space
392 * @len: length of the memory area to map
393 *
394 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
395 */
396int gmap_map_segment(struct gmap *gmap, unsigned long from,
397 unsigned long to, unsigned long len)
398{
399 unsigned long off;
400 int flush;
401
402 if ((from | to | len) & (PMD_SIZE - 1))
403 return -EINVAL;
404 if (len == 0 || from + len < from || to + len < to ||
405 from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end)
406 return -EINVAL;
407
408 flush = 0;
409 down_write(&gmap->mm->mmap_sem);
410 for (off = 0; off < len; off += PMD_SIZE) {
411 /* Remove old translation */
412 flush |= __gmap_unmap_by_gaddr(gmap, to + off);
413 /* Store new translation */
414 if (radix_tree_insert(&gmap->guest_to_host,
415 (to + off) >> PMD_SHIFT,
416 (void *) from + off))
417 break;
418 }
419 up_write(&gmap->mm->mmap_sem);
420 if (flush)
421 gmap_flush_tlb(gmap);
422 if (off >= len)
423 return 0;
424 gmap_unmap_segment(gmap, to, len);
425 return -ENOMEM;
426}
427EXPORT_SYMBOL_GPL(gmap_map_segment);
428
429/**
430 * __gmap_translate - translate a guest address to a user space address
431 * @gmap: pointer to guest mapping meta data structure
432 * @gaddr: guest address
433 *
434 * Returns user space address which corresponds to the guest address or
435 * -EFAULT if no such mapping exists.
436 * This function does not establish potentially missing page table entries.
437 * The mmap_sem of the mm that belongs to the address space must be held
438 * when this function gets called.
439 */
440unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
441{
442 unsigned long vmaddr;
443
444 vmaddr = (unsigned long)
445 radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
446 return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
447}
448EXPORT_SYMBOL_GPL(__gmap_translate);
449
450/**
451 * gmap_translate - translate a guest address to a user space address
452 * @gmap: pointer to guest mapping meta data structure
453 * @gaddr: guest address
454 *
455 * Returns user space address which corresponds to the guest address or
456 * -EFAULT if no such mapping exists.
457 * This function does not establish potentially missing page table entries.
458 */
459unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
460{
461 unsigned long rc;
462
463 down_read(&gmap->mm->mmap_sem);
464 rc = __gmap_translate(gmap, gaddr);
465 up_read(&gmap->mm->mmap_sem);
466 return rc;
467}
468EXPORT_SYMBOL_GPL(gmap_translate);
469
470/**
471 * gmap_unlink - disconnect a page table from the gmap shadow tables
472 * @gmap: pointer to guest mapping meta data structure
473 * @table: pointer to the host page table
474 * @vmaddr: vm address associated with the host page table
475 */
476static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
477 unsigned long vmaddr)
478{
479 struct gmap *gmap;
480 int flush;
481
482 list_for_each_entry(gmap, &mm->context.gmap_list, list) {
483 flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
484 if (flush)
485 gmap_flush_tlb(gmap);
486 }
487}
488
489/**
490 * gmap_link - set up shadow page tables to connect a host to a guest address
491 * @gmap: pointer to guest mapping meta data structure
492 * @gaddr: guest address
493 * @vmaddr: vm address
494 *
495 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
496 * if the vm address is already mapped to a different guest segment.
497 * The mmap_sem of the mm that belongs to the address space must be held
498 * when this function gets called.
499 */
500int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
501{
502 struct mm_struct *mm;
503 unsigned long *table;
504 spinlock_t *ptl;
505 pgd_t *pgd;
506 pud_t *pud;
507 pmd_t *pmd;
508 int rc;
509
510 /* Create higher level tables in the gmap page table */
511 table = gmap->table;
512 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
513 table += (gaddr >> 53) & 0x7ff;
514 if ((*table & _REGION_ENTRY_INVALID) &&
515 gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
516 gaddr & 0xffe0000000000000UL))
517 return -ENOMEM;
518 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
519 }
520 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
521 table += (gaddr >> 42) & 0x7ff;
522 if ((*table & _REGION_ENTRY_INVALID) &&
523 gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
524 gaddr & 0xfffffc0000000000UL))
525 return -ENOMEM;
526 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
527 }
528 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
529 table += (gaddr >> 31) & 0x7ff;
530 if ((*table & _REGION_ENTRY_INVALID) &&
531 gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
532 gaddr & 0xffffffff80000000UL))
533 return -ENOMEM;
534 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
535 }
536 table += (gaddr >> 20) & 0x7ff;
537 /* Walk the parent mm page table */
538 mm = gmap->mm;
539 pgd = pgd_offset(mm, vmaddr);
540 VM_BUG_ON(pgd_none(*pgd));
541 pud = pud_offset(pgd, vmaddr);
542 VM_BUG_ON(pud_none(*pud));
543 pmd = pmd_offset(pud, vmaddr);
544 VM_BUG_ON(pmd_none(*pmd));
545 /* large pmds cannot yet be handled */
546 if (pmd_large(*pmd))
547 return -EFAULT;
548 /* Link gmap segment table entry location to page table. */
549 rc = radix_tree_preload(GFP_KERNEL);
550 if (rc)
551 return rc;
552 ptl = pmd_lock(mm, pmd);
553 spin_lock(&gmap->guest_table_lock);
554 if (*table == _SEGMENT_ENTRY_INVALID) {
555 rc = radix_tree_insert(&gmap->host_to_guest,
556 vmaddr >> PMD_SHIFT, table);
557 if (!rc)
558 *table = pmd_val(*pmd);
559 } else
560 rc = 0;
561 spin_unlock(&gmap->guest_table_lock);
562 spin_unlock(ptl);
563 radix_tree_preload_end();
564 return rc;
565}
566
567/**
568 * gmap_fault - resolve a fault on a guest address
569 * @gmap: pointer to guest mapping meta data structure
570 * @gaddr: guest address
571 * @fault_flags: flags to pass down to handle_mm_fault()
572 *
573 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
574 * if the vm address is already mapped to a different guest segment.
575 */
576int gmap_fault(struct gmap *gmap, unsigned long gaddr,
577 unsigned int fault_flags)
578{
579 unsigned long vmaddr;
580 int rc;
581 bool unlocked;
582
583 down_read(&gmap->mm->mmap_sem);
584
585retry:
586 unlocked = false;
587 vmaddr = __gmap_translate(gmap, gaddr);
588 if (IS_ERR_VALUE(vmaddr)) {
589 rc = vmaddr;
590 goto out_up;
591 }
592 if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
593 &unlocked)) {
594 rc = -EFAULT;
595 goto out_up;
596 }
597 /*
598 * In the case that fixup_user_fault unlocked the mmap_sem during
599 * faultin redo __gmap_translate to not race with a map/unmap_segment.
600 */
601 if (unlocked)
602 goto retry;
603
604 rc = __gmap_link(gmap, gaddr, vmaddr);
605out_up:
606 up_read(&gmap->mm->mmap_sem);
607 return rc;
608}
609EXPORT_SYMBOL_GPL(gmap_fault);
610
611static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
612{
613 if (!non_swap_entry(entry))
614 dec_mm_counter(mm, MM_SWAPENTS);
615 else if (is_migration_entry(entry)) {
616 struct page *page = migration_entry_to_page(entry);
617
618 dec_mm_counter(mm, mm_counter(page));
619 }
620 free_swap_and_cache(entry);
621}
622
623/*
624 * this function is assumed to be called with mmap_sem held
625 */
626void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
627{
628 unsigned long vmaddr, ptev, pgstev;
629 pte_t *ptep, pte;
630 spinlock_t *ptl;
631 pgste_t pgste;
632
633 /* Find the vm address for the guest address */
634 vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
635 gaddr >> PMD_SHIFT);
636 if (!vmaddr)
637 return;
638 vmaddr |= gaddr & ~PMD_MASK;
639 /* Get pointer to the page table entry */
640 ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
641 if (unlikely(!ptep))
642 return;
643 pte = *ptep;
644 if (!pte_swap(pte))
645 goto out_pte;
646 /* Zap unused and logically-zero pages */
647 pgste = pgste_get_lock(ptep);
648 pgstev = pgste_val(pgste);
649 ptev = pte_val(pte);
650 if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
651 ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
652 gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
653 pte_clear(gmap->mm, vmaddr, ptep);
654 }
655 pgste_set_unlock(ptep, pgste);
656out_pte:
657 pte_unmap_unlock(ptep, ptl);
658}
659EXPORT_SYMBOL_GPL(__gmap_zap);
660
661void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
662{
663 unsigned long gaddr, vmaddr, size;
664 struct vm_area_struct *vma;
665
666 down_read(&gmap->mm->mmap_sem);
667 for (gaddr = from; gaddr < to;
668 gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
669 /* Find the vm address for the guest address */
670 vmaddr = (unsigned long)
671 radix_tree_lookup(&gmap->guest_to_host,
672 gaddr >> PMD_SHIFT);
673 if (!vmaddr)
674 continue;
675 vmaddr |= gaddr & ~PMD_MASK;
676 /* Find vma in the parent mm */
677 vma = find_vma(gmap->mm, vmaddr);
678 size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
679 zap_page_range(vma, vmaddr, size, NULL);
680 }
681 up_read(&gmap->mm->mmap_sem);
682}
683EXPORT_SYMBOL_GPL(gmap_discard);
684
685static LIST_HEAD(gmap_notifier_list);
686static DEFINE_SPINLOCK(gmap_notifier_lock);
687
688/**
689 * gmap_register_ipte_notifier - register a pte invalidation callback
690 * @nb: pointer to the gmap notifier block
691 */
692void gmap_register_ipte_notifier(struct gmap_notifier *nb)
693{
694 spin_lock(&gmap_notifier_lock);
695 list_add(&nb->list, &gmap_notifier_list);
696 spin_unlock(&gmap_notifier_lock);
697}
698EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
699
700/**
701 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
702 * @nb: pointer to the gmap notifier block
703 */
704void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
705{
706 spin_lock(&gmap_notifier_lock);
707 list_del_init(&nb->list);
708 spin_unlock(&gmap_notifier_lock);
709}
710EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
711
712/**
713 * gmap_ipte_notify - mark a range of ptes for invalidation notification
714 * @gmap: pointer to guest mapping meta data structure
715 * @gaddr: virtual address in the guest address space
716 * @len: size of area
717 *
718 * Returns 0 if for each page in the given range a gmap mapping exists and
719 * the invalidation notification could be set. If the gmap mapping is missing
720 * for one or more pages -EFAULT is returned. If no memory could be allocated
721 * -ENOMEM is returned. This function establishes missing page table entries.
722 */
723int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
724{
725 unsigned long addr;
726 spinlock_t *ptl;
727 pte_t *ptep, entry;
728 pgste_t pgste;
729 bool unlocked;
730 int rc = 0;
731
732 if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
733 return -EINVAL;
734 down_read(&gmap->mm->mmap_sem);
735 while (len) {
736 unlocked = false;
737 /* Convert gmap address and connect the page tables */
738 addr = __gmap_translate(gmap, gaddr);
739 if (IS_ERR_VALUE(addr)) {
740 rc = addr;
741 break;
742 }
743 /* Get the page mapped */
744 if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
745 &unlocked)) {
746 rc = -EFAULT;
747 break;
748 }
749		/* mmap_sem was unlocked while faulting in the page, let us retry */
750 if (unlocked)
751 continue;
752 rc = __gmap_link(gmap, gaddr, addr);
753 if (rc)
754 break;
755 /* Walk the process page table, lock and get pte pointer */
756 ptep = get_locked_pte(gmap->mm, addr, &ptl);
757 VM_BUG_ON(!ptep);
758 /* Set notification bit in the pgste of the pte */
759 entry = *ptep;
760 if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
761 pgste = pgste_get_lock(ptep);
762 pgste_val(pgste) |= PGSTE_IN_BIT;
763 pgste_set_unlock(ptep, pgste);
764 gaddr += PAGE_SIZE;
765 len -= PAGE_SIZE;
766 }
767 pte_unmap_unlock(ptep, ptl);
768 }
769 up_read(&gmap->mm->mmap_sem);
770 return rc;
771}
772EXPORT_SYMBOL_GPL(gmap_ipte_notify);
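To illustrate how gmap_ipte_notify() is meant to be used, the sketch below arms invalidation notification for a single 4K guest page before the caller starts relying on its contents. Both gaddr and len must be page aligned; the function faults in and links the backing page table on the way. The helper name and the surrounding context (where the gmap pointer comes from) are assumptions.

/* Sketch: arm ipte notification for one guest page. */
static int sample_watch_guest_page(struct gmap *gmap, unsigned long gaddr)
{
	int rc;

	rc = gmap_ipte_notify(gmap, gaddr & PAGE_MASK, PAGE_SIZE);
	if (rc)		/* -EFAULT: no gmap mapping, -ENOMEM: allocation failed */
		return rc;
	/*
	 * From here on, any host-side invalidation of the pte backing
	 * gaddr invokes the registered gmap notifier callbacks.
	 */
	return 0;
}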
773
774/**
775 * ptep_ipte_notify - call all invalidation callbacks for a specific pte.
776 * @mm: pointer to the process mm_struct
777 * @addr: virtual address in the process address space
778 * @pte: pointer to the page table entry
779 *
780 * This function is assumed to be called with the page table lock held
781 * for the pte to notify.
782 */
783void ptep_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
784{
785 unsigned long offset, gaddr;
786 unsigned long *table;
787 struct gmap_notifier *nb;
788 struct gmap *gmap;
789
790 offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
791 offset = offset * (4096 / sizeof(pte_t));
792 spin_lock(&gmap_notifier_lock);
793 list_for_each_entry(gmap, &mm->context.gmap_list, list) {
794 table = radix_tree_lookup(&gmap->host_to_guest,
795 vmaddr >> PMD_SHIFT);
796 if (!table)
797 continue;
798 gaddr = __gmap_segment_gaddr(table) + offset;
799 list_for_each_entry(nb, &gmap_notifier_list, list)
800 nb->notifier_call(gmap, gaddr);
801 }
802 spin_unlock(&gmap_notifier_lock);
803}
804EXPORT_SYMBOL_GPL(ptep_ipte_notify);
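The offset arithmetic at the top of ptep_ipte_notify() is terse, so a worked example may help: a 2K page table holds 256 ptes of 8 bytes, so masking the pte pointer with 255 * sizeof(pte_t) yields the byte offset of the entry within its 2K aligned table, and multiplying by 4096 / sizeof(pte_t), i.e. 512, converts that into the byte offset of the mapped page within the 1M segment, which is then added to the segment's guest address. The numbers in the user space demo below are made up for illustration.

/* Illustration only: offset math for the pte at index 5 of its table. */
#include <stdio.h>

int main(void)
{
	unsigned long pte_size = 8;		/* sizeof(pte_t) on s390 */
	unsigned long table = 0x1000000ul;	/* made-up, 2K aligned table */
	unsigned long pte_addr = table + 5 * pte_size;
	unsigned long offset;

	offset = pte_addr & (255 * pte_size);	/* 0x28: byte offset in table */
	offset = offset * (4096 / pte_size);	/* 0x5000: offset in segment */
	printf("pte index 5 -> guest segment offset 0x%lx\n", offset);
	return 0;
}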
805
806int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
807 unsigned long key, bool nq)
808{
809 spinlock_t *ptl;
810 pgste_t old, new;
811 pte_t *ptep;
812
813 down_read(&mm->mmap_sem);
814 ptep = get_locked_pte(mm, addr, &ptl);
815 if (unlikely(!ptep)) {
816 up_read(&mm->mmap_sem);
817 return -EFAULT;
818 }
819
820 new = old = pgste_get_lock(ptep);
821 pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
822 PGSTE_ACC_BITS | PGSTE_FP_BIT);
823 pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
824 pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
825 if (!(pte_val(*ptep) & _PAGE_INVALID)) {
826 unsigned long address, bits, skey;
827
828 address = pte_val(*ptep) & PAGE_MASK;
829 skey = (unsigned long) page_get_storage_key(address);
830 bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
831 skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
832 /* Set storage key ACC and FP */
833 page_set_storage_key(address, skey, !nq);
834 /* Merge host changed & referenced into pgste */
835 pgste_val(new) |= bits << 52;
836 }
837 /* changing the guest storage key is considered a change of the page */
838 if ((pgste_val(new) ^ pgste_val(old)) &
839 (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
840 pgste_val(new) |= PGSTE_UC_BIT;
841
842 pgste_set_unlock(ptep, new);
843 pte_unmap_unlock(ptep, ptl);
844 up_read(&mm->mmap_sem);
845 return 0;
846}
847EXPORT_SYMBOL(set_guest_storage_key);
848
849unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
850{
851 spinlock_t *ptl;
852 pgste_t pgste;
853 pte_t *ptep;
854 uint64_t physaddr;
855 unsigned long key = 0;
856
857 down_read(&mm->mmap_sem);
858 ptep = get_locked_pte(mm, addr, &ptl);
859 if (unlikely(!ptep)) {
860 up_read(&mm->mmap_sem);
861 return -EFAULT;
862 }
863 pgste = pgste_get_lock(ptep);
864
865 if (pte_val(*ptep) & _PAGE_INVALID) {
866 key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
867 key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
868 key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
869 key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
870 } else {
871 physaddr = pte_val(*ptep) & PAGE_MASK;
872 key = page_get_storage_key(physaddr);
873
874 /* Reflect guest's logical view, not physical */
875 if (pgste_val(pgste) & PGSTE_GR_BIT)
876 key |= _PAGE_REFERENCED;
877 if (pgste_val(pgste) & PGSTE_GC_BIT)
878 key |= _PAGE_CHANGED;
879 }
880
881 pgste_set_unlock(ptep, pgste);
882 pte_unmap_unlock(ptep, ptl);
883 up_read(&mm->mmap_sem);
884 return key;
885}
886EXPORT_SYMBOL(get_guest_storage_key);
887
888static int page_table_allocate_pgste_min = 0;
889static int page_table_allocate_pgste_max = 1;
890int page_table_allocate_pgste = 0;
891EXPORT_SYMBOL(page_table_allocate_pgste);
892
893static struct ctl_table page_table_sysctl[] = {
894 {
895 .procname = "allocate_pgste",
896 .data = &page_table_allocate_pgste,
897 .maxlen = sizeof(int),
898 .mode = S_IRUGO | S_IWUSR,
899 .proc_handler = proc_dointvec,
900 .extra1 = &page_table_allocate_pgste_min,
901 .extra2 = &page_table_allocate_pgste_max,
902 },
903 { }
904};
905
906static struct ctl_table page_table_sysctl_dir[] = {
907 {
908 .procname = "vm",
909 .maxlen = 0,
910 .mode = 0555,
911 .child = page_table_sysctl,
912 },
913 { }
914};
915
916static int __init page_table_register_sysctl(void)
917{
918 return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
919}
920__initcall(page_table_register_sysctl);
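Once the table above is registered, the switch appears as /proc/sys/vm/allocate_pgste and controls whether newly created processes get full 4K page tables with PGSTEs. A minimal user space sketch of flipping it before starting guests is shown below; it assumes root privileges and keeps error handling to the bare minimum.

/* Sketch: request 4K page tables with PGSTEs for subsequently created mms. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/vm/allocate_pgste", O_WRONLY);

	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror("allocate_pgste");
		return 1;
	}
	close(fd);
	return 0;
}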
921
922#else /* CONFIG_PGSTE */
923
924static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
925 unsigned long vmaddr)
926{
927}
928
929#endif /* CONFIG_PGSTE */
930
931static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
932{
933 unsigned int old, new;
934
935 do {
936 old = atomic_read(v);
937 new = old ^ bits;
938 } while (atomic_cmpxchg(v, old, new) != old);
939 return new;
940}
941
942/*
943 * page table entry allocation/free routines.
944 */
945unsigned long *page_table_alloc(struct mm_struct *mm)
946{
947 unsigned long *table;
948 struct page *page;
949 unsigned int mask, bit;
950
951 /* Try to get a fragment of a 4K page as a 2K page table */
952 if (!mm_alloc_pgste(mm)) {
953 table = NULL;
954 spin_lock_bh(&mm->context.list_lock);
955 if (!list_empty(&mm->context.pgtable_list)) {
956 page = list_first_entry(&mm->context.pgtable_list,
957 struct page, lru);
958 mask = atomic_read(&page->_mapcount);
959 mask = (mask | (mask >> 4)) & 3;
960 if (mask != 3) {
961 table = (unsigned long *) page_to_phys(page);
962 bit = mask & 1; /* =1 -> second 2K */
963 if (bit)
964 table += PTRS_PER_PTE;
965 atomic_xor_bits(&page->_mapcount, 1U << bit);
966 list_del(&page->lru);
967 }
968 }
969 spin_unlock_bh(&mm->context.list_lock);
970 if (table)
971 return table;
972 }
973 /* Allocate a fresh page */
974 page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
975 if (!page)
976 return NULL;
977 if (!pgtable_page_ctor(page)) {
978 __free_page(page);
979 return NULL;
980 }
981 /* Initialize page table */
982 table = (unsigned long *) page_to_phys(page);
983 if (mm_alloc_pgste(mm)) {
984 /* Return 4K page table with PGSTEs */
985 atomic_set(&page->_mapcount, 3);
986 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
987 clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
988 } else {
989 /* Return the first 2K fragment of the page */
990 atomic_set(&page->_mapcount, 1);
991 clear_table(table, _PAGE_INVALID, PAGE_SIZE);
992 spin_lock_bh(&mm->context.list_lock);
993 list_add(&page->lru, &mm->context.pgtable_list);
994 spin_unlock_bh(&mm->context.list_lock);
995 }
996 return table;
997}
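The page->_mapcount encoding used here is compact: bit 0 and bit 1 mark the lower and upper 2K fragment as allocated, bits 4 and 5 (set in the RCU free path below) mark a fragment as pending removal, and the value 3 denotes a full 4K table with PGSTEs. The sketch below only shows the intended pairing of the allocator with page_table_free(); the function name is hypothetical and the include lines are a best guess for this kernel version.

/* Sketch: allocate and release one page table for a given mm. */
#include <linux/errno.h>
#include <linux/mm.h>
#include <asm/pgalloc.h>

static int sample_use_page_table(struct mm_struct *mm)
{
	unsigned long *table;

	table = page_table_alloc(mm);	/* 2K fragment, or 4K with PGSTEs */
	if (!table)
		return -ENOMEM;
	/* ... install the table in a segment entry and populate it ... */
	page_table_free(mm, table);	/* direct free, no RCU grace period */
	return 0;
}

Tables that were visible to other CPUs must instead go through page_table_free_rcu()/tlb_remove_table() further down, so the memory is not freed under concurrent lockless walkers.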
998
999void page_table_free(struct mm_struct *mm, unsigned long *table)
1000{
1001 struct page *page;
1002 unsigned int bit, mask;
1003
1004 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1005 if (!mm_alloc_pgste(mm)) {
1006 /* Free 2K page table fragment of a 4K page */
1007 bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
1008 spin_lock_bh(&mm->context.list_lock);
1009 mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
1010 if (mask & 3)
1011 list_add(&page->lru, &mm->context.pgtable_list);
1012 else
1013 list_del(&page->lru);
1014 spin_unlock_bh(&mm->context.list_lock);
1015 if (mask != 0)
1016 return;
1017 }
1018
1019 pgtable_page_dtor(page);
1020 atomic_set(&page->_mapcount, -1);
1021 __free_page(page);
1022}
1023
1024void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
1025 unsigned long vmaddr)
1026{
1027 struct mm_struct *mm;
1028 struct page *page;
1029 unsigned int bit, mask;
1030
1031 mm = tlb->mm;
1032 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1033 if (mm_alloc_pgste(mm)) {
1034 gmap_unlink(mm, table, vmaddr);
1035 table = (unsigned long *) (__pa(table) | 3);
1036 tlb_remove_table(tlb, table);
1037 return;
1038 }
1039 bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
1040 spin_lock_bh(&mm->context.list_lock);
1041 mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
1042 if (mask & 3)
1043 list_add_tail(&page->lru, &mm->context.pgtable_list);
1044 else
1045 list_del(&page->lru);
1046 spin_unlock_bh(&mm->context.list_lock);
1047 table = (unsigned long *) (__pa(table) | (1U << bit));
1048 tlb_remove_table(tlb, table);
1049}
1050
1051static void __tlb_remove_table(void *_table)
1052{
1053 unsigned int mask = (unsigned long) _table & 3;
1054 void *table = (void *)((unsigned long) _table ^ mask);
1055 struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
1056
1057 switch (mask) {
1058 case 0: /* pmd or pud */
1059 free_pages((unsigned long) table, 2);
1060 break;
1061 case 1: /* lower 2K of a 4K page table */
1062 case 2: /* higher 2K of a 4K page table */
1063 if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
1064 break;
1065 /* fallthrough */
1066 case 3: /* 4K page table with pgstes */
1067 pgtable_page_dtor(page);
1068 atomic_set(&page->_mapcount, -1);
1069 __free_page(page);
1070 break;
1071 }
1072}
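The low two bits of the opaque pointer decoded above act as a type tag: they are set in page_table_free_rcu() below and, for the 16K region/segment tables, in the crst free path that now lives in pgalloc.c (assumed here, it is not part of this hunk). The enum below merely gives the four cases names for readability; it is not part of the kernel sources.

/* Naming of the tag bits handed to __tlb_remove_table() (illustration). */
enum s390_tlb_table_tag {
	TAG_CRST_TABLE	= 0,	/* 16K region or segment table, order-2 free */
	TAG_LOWER_2K	= 1,	/* lower 2K pte fragment of a 4K page */
	TAG_UPPER_2K	= 2,	/* upper 2K pte fragment of a 4K page */
	TAG_PGSTE_4K	= 3,	/* full 4K pte table with PGSTEs */
};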
1073
1074static void tlb_remove_table_smp_sync(void *arg)
1075{
1076 /* Simply deliver the interrupt */
1077}
1078
1079static void tlb_remove_table_one(void *table)
1080{
1081 /*
1082 * This isn't an RCU grace period and hence the page-tables cannot be
1083 * assumed to be actually RCU-freed.
1084 *
1085 * It is however sufficient for software page-table walkers that rely
1086 * on IRQ disabling. See the comment near struct mmu_table_batch.
1087 */
1088 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
1089 __tlb_remove_table(table);
1090}
1091
1092static void tlb_remove_table_rcu(struct rcu_head *head)
1093{
1094 struct mmu_table_batch *batch;
1095 int i;
1096
1097 batch = container_of(head, struct mmu_table_batch, rcu);
1098
1099 for (i = 0; i < batch->nr; i++)
1100 __tlb_remove_table(batch->tables[i]);
1101
1102 free_page((unsigned long)batch);
1103}
1104
1105void tlb_table_flush(struct mmu_gather *tlb)
1106{
1107 struct mmu_table_batch **batch = &tlb->batch;
1108
1109 if (*batch) {
1110 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
1111 *batch = NULL;
1112 }
1113}
1114
1115void tlb_remove_table(struct mmu_gather *tlb, void *table)
1116{
1117 struct mmu_table_batch **batch = &tlb->batch;
1118
1119 tlb->mm->context.flush_mm = 1;
1120 if (*batch == NULL) {
1121 *batch = (struct mmu_table_batch *)
1122 __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
1123 if (*batch == NULL) {
1124 __tlb_flush_mm_lazy(tlb->mm);
1125 tlb_remove_table_one(table);
1126 return;
1127 }
1128 (*batch)->nr = 0;
1129 }
1130 (*batch)->tables[(*batch)->nr++] = table;
1131 if ((*batch)->nr == MAX_TABLE_BATCH)
1132 tlb_flush_mmu(tlb);
1133}
1134
1135#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1136static inline void thp_split_vma(struct vm_area_struct *vma)
1137{
1138 unsigned long addr;
1139
1140 for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
1141 follow_page(vma, addr, FOLL_SPLIT);
1142}
1143
1144static inline void thp_split_mm(struct mm_struct *mm)
1145{
1146 struct vm_area_struct *vma;
1147
1148 for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
1149 thp_split_vma(vma);
1150 vma->vm_flags &= ~VM_HUGEPAGE;
1151 vma->vm_flags |= VM_NOHUGEPAGE;
1152 }
1153 mm->def_flags |= VM_NOHUGEPAGE;
1154}
1155#else
1156static inline void thp_split_mm(struct mm_struct *mm)
1157{
1158}
1159#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1160
1161   27 static inline pte_t ptep_flush_direct(struct mm_struct *mm,
1162   28 				      unsigned long addr, pte_t *ptep)
1163   29 {
@@ -1198,6 +64,55 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
1198   64 	return old;
1199   65 }
1200   66
67static inline pgste_t pgste_get_lock(pte_t *ptep)
68{
69 unsigned long new = 0;
70#ifdef CONFIG_PGSTE
71 unsigned long old;
72
73 preempt_disable();
74 asm(
75 " lg %0,%2\n"
76 "0: lgr %1,%0\n"
77 " nihh %0,0xff7f\n" /* clear PCL bit in old */
78 " oihh %1,0x0080\n" /* set PCL bit in new */
79 " csg %0,%1,%2\n"
80 " jl 0b\n"
81 : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
82 : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory");
83#endif
84 return __pgste(new);
85}
86
87static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
88{
89#ifdef CONFIG_PGSTE
90 asm(
91 " nihh %1,0xff7f\n" /* clear PCL bit */
92 " stg %1,%0\n"
93 : "=Q" (ptep[PTRS_PER_PTE])
94 : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
95 : "cc", "memory");
96 preempt_enable();
97#endif
98}
99
100static inline pgste_t pgste_get(pte_t *ptep)
101{
102 unsigned long pgste = 0;
103#ifdef CONFIG_PGSTE
104 pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
105#endif
106 return __pgste(pgste);
107}
108
109static inline void pgste_set(pte_t *ptep, pgste_t pgste)
110{
111#ifdef CONFIG_PGSTE
112 *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
113#endif
114}
115
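The four helpers above always bracket pgste manipulation in the same way, as every function later in this file shows. If it lived in this file, a condensed form of the pattern would read as follows; the function name is hypothetical and the PGSTE_IN_BIT update is just an example of a bit flip done under the lock (it mirrors ptep_set_notify() further down).

/* Sketch of the canonical pgste critical section used in this file. */
static void sample_touch_pgste(pte_t *ptep)
{
	pgste_t pgste;

	pgste = pgste_get_lock(ptep);		/* set PCL bit, preemption off */
	pgste_val(pgste) |= PGSTE_IN_BIT;	/* e.g. request ipte notification */
	pgste_set_unlock(ptep, pgste);		/* store back, clear PCL bit */
}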
1201  116 static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
1202  117 				       struct mm_struct *mm)
1203  118 {
@@ -1271,63 +186,12 @@ static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
1271  186 #ifdef CONFIG_PGSTE
1272  187 	if (pgste_val(pgste) & PGSTE_IN_BIT) {
1273  188 		pgste_val(pgste) &= ~PGSTE_IN_BIT;
1274 ptep_ipte_notify(mm, addr, ptep); 189 ptep_notify(mm, addr, ptep);
1275  190 	}
1276  191 #endif
1277  192 	return pgste;
1278  193 }
1279  194
1280#ifdef CONFIG_PGSTE
1281/*
1282 * Test and reset if a guest page is dirty
1283 */
1284bool pgste_test_and_clear_dirty(struct mm_struct *mm, unsigned long addr)
1285{
1286 spinlock_t *ptl;
1287 pgste_t pgste;
1288 pte_t *ptep;
1289 pte_t pte;
1290 bool dirty;
1291
1292 ptep = get_locked_pte(mm, addr, &ptl);
1293 if (unlikely(!ptep))
1294 return false;
1295
1296 pgste = pgste_get_lock(ptep);
1297 dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
1298 pgste_val(pgste) &= ~PGSTE_UC_BIT;
1299 pte = *ptep;
1300 if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
1301 pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
1302 __ptep_ipte(addr, ptep);
1303 if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
1304 pte_val(pte) |= _PAGE_PROTECT;
1305 else
1306 pte_val(pte) |= _PAGE_INVALID;
1307 *ptep = pte;
1308 }
1309 pgste_set_unlock(ptep, pgste);
1310
1311 spin_unlock(ptl);
1312 return dirty;
1313}
1314EXPORT_SYMBOL_GPL(pgste_test_and_clear_dirty);
1315
1316void set_pte_pgste_at(struct mm_struct *mm, unsigned long addr,
1317 pte_t *ptep, pte_t entry)
1318{
1319 pgste_t pgste;
1320
1321 /* the mm_has_pgste() check is done in set_pte_at() */
1322 pgste = pgste_get_lock(ptep);
1323 pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
1324 pgste_set_key(ptep, pgste, entry, mm);
1325 pgste = pgste_set_pte(ptep, pgste, entry);
1326 pgste_set_unlock(ptep, pgste);
1327}
1328EXPORT_SYMBOL(set_pte_pgste_at);
1329#endif
1330
1331  195 static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
1332  196 				      unsigned long addr, pte_t *ptep)
1333  197 {
@@ -1486,112 +350,6 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
1486  350 }
1487  351 EXPORT_SYMBOL(pmdp_xchg_lazy);
1488  352
1489/*
1490 * switch on pgstes for its userspace process (for kvm)
1491 */
1492int s390_enable_sie(void)
1493{
1494 struct mm_struct *mm = current->mm;
1495
1496 /* Do we have pgstes? if yes, we are done */
1497 if (mm_has_pgste(mm))
1498 return 0;
1499 /* Fail if the page tables are 2K */
1500 if (!mm_alloc_pgste(mm))
1501 return -EINVAL;
1502 down_write(&mm->mmap_sem);
1503 mm->context.has_pgste = 1;
1504 /* split thp mappings and disable thp for future mappings */
1505 thp_split_mm(mm);
1506 up_write(&mm->mmap_sem);
1507 return 0;
1508}
1509EXPORT_SYMBOL_GPL(s390_enable_sie);
1510
1511/*
1512 * Enable storage key handling from now on and initialize the storage
1513 * keys with the default key.
1514 */
1515static int __s390_enable_skey(pte_t *pte, unsigned long addr,
1516 unsigned long next, struct mm_walk *walk)
1517{
1518 unsigned long ptev;
1519 pgste_t pgste;
1520
1521	/*
1522	 * Remove all zero page mappings; once a policy that forbids zero page
1523	 * mappings is established, subsequent faults for these pages will get
1524	 * fresh anonymous pages instead.
1525	 */
1526 if (is_zero_pfn(pte_pfn(*pte)))
1527 ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID));
1528 /* Clear storage key */
1529 pgste = pgste_get_lock(pte);
1530 pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
1531 PGSTE_GR_BIT | PGSTE_GC_BIT);
1532 ptev = pte_val(*pte);
1533 if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
1534 page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
1535 pgste_set_unlock(pte, pgste);
1536 return 0;
1537}
1538
1539int s390_enable_skey(void)
1540{
1541 struct mm_walk walk = { .pte_entry = __s390_enable_skey };
1542 struct mm_struct *mm = current->mm;
1543 struct vm_area_struct *vma;
1544 int rc = 0;
1545
1546 down_write(&mm->mmap_sem);
1547 if (mm_use_skey(mm))
1548 goto out_up;
1549
1550 mm->context.use_skey = 1;
1551 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1552 if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
1553 MADV_UNMERGEABLE, &vma->vm_flags)) {
1554 mm->context.use_skey = 0;
1555 rc = -ENOMEM;
1556 goto out_up;
1557 }
1558 }
1559 mm->def_flags &= ~VM_MERGEABLE;
1560
1561 walk.mm = mm;
1562 walk_page_range(0, TASK_SIZE, &walk);
1563
1564out_up:
1565 up_write(&mm->mmap_sem);
1566 return rc;
1567}
1568EXPORT_SYMBOL_GPL(s390_enable_skey);
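The two enablement paths above are used in a fixed order by a hypervisor: pgstes first (which only flips has_pgste and splits THP mappings, and requires that the mm was already created with 4K page tables), storage keys later and only once the guest actually issues a key instruction. A hedged sketch of such a caller, with a hypothetical function name:

/* Sketch: enable SIE first, storage key handling on demand later. */
#include <asm/pgtable.h>	/* assumed location of the declarations */

static int sample_enable_guest_features(void)
{
	int rc;

	rc = s390_enable_sie();	/* -EINVAL if the mm has only 2K page tables */
	if (rc)
		return rc;
	/*
	 * Deferred in practice: enabling keys walks all of user space and
	 * unmerges KSM pages, so it is done only when the guest needs it.
	 */
	return s390_enable_skey();
}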
1569
1570/*
1571 * Reset CMMA state, make all pages stable again.
1572 */
1573static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
1574 unsigned long next, struct mm_walk *walk)
1575{
1576 pgste_t pgste;
1577
1578 pgste = pgste_get_lock(pte);
1579 pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
1580 pgste_set_unlock(pte, pgste);
1581 return 0;
1582}
1583
1584void s390_reset_cmma(struct mm_struct *mm)
1585{
1586 struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
1587
1588 down_write(&mm->mmap_sem);
1589 walk.mm = mm;
1590 walk_page_range(0, TASK_SIZE, &walk);
1591 up_write(&mm->mmap_sem);
1592}
1593EXPORT_SYMBOL_GPL(s390_reset_cmma);
1594
1595  353 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1596  354 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1597  355 				pgtable_t pgtable)
@@ -1632,3 +390,193 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1632  390 	return pgtable;
1633  391 }
1634  392 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
393
394#ifdef CONFIG_PGSTE
395void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
396 pte_t *ptep, pte_t entry)
397{
398 pgste_t pgste;
399
400 /* the mm_has_pgste() check is done in set_pte_at() */
401 pgste = pgste_get_lock(ptep);
402 pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
403 pgste_set_key(ptep, pgste, entry, mm);
404 pgste = pgste_set_pte(ptep, pgste, entry);
405 pgste_set_unlock(ptep, pgste);
406}
407
408void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
409{
410 pgste_t pgste;
411
412 pgste = pgste_get_lock(ptep);
413 pgste_val(pgste) |= PGSTE_IN_BIT;
414 pgste_set_unlock(ptep, pgste);
415}
416
417static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
418{
419 if (!non_swap_entry(entry))
420 dec_mm_counter(mm, MM_SWAPENTS);
421 else if (is_migration_entry(entry)) {
422 struct page *page = migration_entry_to_page(entry);
423
424 dec_mm_counter(mm, mm_counter(page));
425 }
426 free_swap_and_cache(entry);
427}
428
429void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
430 pte_t *ptep, int reset)
431{
432 unsigned long pgstev;
433 pgste_t pgste;
434 pte_t pte;
435
436 /* Zap unused and logically-zero pages */
437 pgste = pgste_get_lock(ptep);
438 pgstev = pgste_val(pgste);
439 pte = *ptep;
440 if (pte_swap(pte) &&
441 ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
442 (pgstev & _PGSTE_GPS_ZERO))) {
443 ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
444 pte_clear(mm, addr, ptep);
445 }
446 if (reset)
447 pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
448 pgste_set_unlock(ptep, pgste);
449}
450
451void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
452{
453 unsigned long ptev;
454 pgste_t pgste;
455
456 /* Clear storage key */
457 pgste = pgste_get_lock(ptep);
458 pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
459 PGSTE_GR_BIT | PGSTE_GC_BIT);
460 ptev = pte_val(*ptep);
461 if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
462 page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
463 pgste_set_unlock(ptep, pgste);
464}
465
466/*
467 * Test and reset if a guest page is dirty
468 */
469bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
470{
471 spinlock_t *ptl;
472 pgste_t pgste;
473 pte_t *ptep;
474 pte_t pte;
475 bool dirty;
476
477 ptep = get_locked_pte(mm, addr, &ptl);
478 if (unlikely(!ptep))
479 return false;
480
481 pgste = pgste_get_lock(ptep);
482 dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
483 pgste_val(pgste) &= ~PGSTE_UC_BIT;
484 pte = *ptep;
485 if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
486 pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
487 __ptep_ipte(addr, ptep);
488 if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
489 pte_val(pte) |= _PAGE_PROTECT;
490 else
491 pte_val(pte) |= _PAGE_INVALID;
492 *ptep = pte;
493 }
494 pgste_set_unlock(ptep, pgste);
495
496 spin_unlock(ptl);
497 return dirty;
498}
499EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty);
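test_and_clear_guest_dirty() works on a single host address, so dirty logging amounts to sweeping the host addresses that back a guest memory slot and collecting the results in a bitmap. The sketch below shows the shape of such a sweep; the function name and bitmap layout are assumptions, and the caller is assumed to hold mm->mmap_sem for reading so the page tables stay stable.

/* Sketch: collect dirty bits for a range of host addresses. */
#include <linux/bitops.h>
#include <linux/mm.h>

static unsigned long sample_sweep_dirty(struct mm_struct *mm,
					unsigned long start, unsigned long end,
					unsigned long *bitmap)
{
	unsigned long addr, nr_dirty = 0;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		if (test_and_clear_guest_dirty(mm, addr)) {
			__set_bit((addr - start) >> PAGE_SHIFT, bitmap);
			nr_dirty++;
		}
	}
	return nr_dirty;
}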
500
501int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
502 unsigned char key, bool nq)
503{
504 unsigned long keyul;
505 spinlock_t *ptl;
506 pgste_t old, new;
507 pte_t *ptep;
508
509 down_read(&mm->mmap_sem);
510 ptep = get_locked_pte(mm, addr, &ptl);
511 if (unlikely(!ptep)) {
512 up_read(&mm->mmap_sem);
513 return -EFAULT;
514 }
515
516 new = old = pgste_get_lock(ptep);
517 pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
518 PGSTE_ACC_BITS | PGSTE_FP_BIT);
519 keyul = (unsigned long) key;
520 pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
521 pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
522 if (!(pte_val(*ptep) & _PAGE_INVALID)) {
523 unsigned long address, bits, skey;
524
525 address = pte_val(*ptep) & PAGE_MASK;
526 skey = (unsigned long) page_get_storage_key(address);
527 bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
528 skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
529 /* Set storage key ACC and FP */
530 page_set_storage_key(address, skey, !nq);
531 /* Merge host changed & referenced into pgste */
532 pgste_val(new) |= bits << 52;
533 }
534 /* changing the guest storage key is considered a change of the page */
535 if ((pgste_val(new) ^ pgste_val(old)) &
536 (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
537 pgste_val(new) |= PGSTE_UC_BIT;
538
539 pgste_set_unlock(ptep, new);
540 pte_unmap_unlock(ptep, ptl);
541 up_read(&mm->mmap_sem);
542 return 0;
543}
544EXPORT_SYMBOL(set_guest_storage_key);
545
546unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
547{
548 unsigned char key;
549 spinlock_t *ptl;
550 pgste_t pgste;
551 pte_t *ptep;
552
553 down_read(&mm->mmap_sem);
554 ptep = get_locked_pte(mm, addr, &ptl);
555 if (unlikely(!ptep)) {
556 up_read(&mm->mmap_sem);
557 return -EFAULT;
558 }
559 pgste = pgste_get_lock(ptep);
560
561 if (pte_val(*ptep) & _PAGE_INVALID) {
562 key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
563 key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
564 key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
565 key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
566 } else {
567 key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
568
569 /* Reflect guest's logical view, not physical */
570 if (pgste_val(pgste) & PGSTE_GR_BIT)
571 key |= _PAGE_REFERENCED;
572 if (pgste_val(pgste) & PGSTE_GC_BIT)
573 key |= _PAGE_CHANGED;
574 }
575
576 pgste_set_unlock(ptep, pgste);
577 pte_unmap_unlock(ptep, ptl);
578 up_read(&mm->mmap_sem);
579 return key;
580}
581EXPORT_SYMBOL(get_guest_storage_key);
582#endif
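To close the CONFIG_PGSTE block, here is a hedged sketch of how the two storage key accessors above might be combined, for instance when the contents of one backing page are migrated to another. Both take host (userspace) addresses and do their own locking, so the caller only needs the mm; the helper name is hypothetical.

/* Sketch: copy the guest storage key from one host address to another. */
static int sample_copy_storage_key(struct mm_struct *mm,
				   unsigned long from, unsigned long to)
{
	unsigned char key;

	key = get_guest_storage_key(mm, from);
	/* the nq flag is forwarded to page_set_storage_key() */
	return set_guest_storage_key(mm, to, key, false);
}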