author     Martin Schwidefsky <schwidefsky@de.ibm.com>   2016-03-08 06:12:18 -0500
committer  Christian Borntraeger <borntraeger@de.ibm.com>   2016-06-20 03:54:04 -0400
commit     4be130a08420d6918d80c1067f8078f425eb98df (patch)
tree       c3e323bf6597eea8e588586550e378acf7652ce2
parent     6ea427bbbd4078297bb1dbd6c5cb83f3f48aac46 (diff)
s390/mm: add shadow gmap support
For a nested KVM guest the outer KVM host needs to create shadow page tables for the nested guest. This patch adds the basic support to the guest address space (gmap) code.

For each guest address space the inner KVM host creates, the outer KVM host needs to create shadow page tables. The address space is identified by the ASCE loaded into control register 1 at the time the inner SIE instruction for the second nested KVM guest is executed. The outer KVM host creates the shadow tables starting with the table identified by the ASCE on an on-demand basis, and will get repeated faults for all the shadow tables needed to run the second KVM guest.

While a shadow page table for the second KVM guest is active, access to the origin region, segment and page tables needs to be restricted for the first KVM guest. For region, segment and page tables the first KVM guest may read the memory, but a write attempt has to lead to an unshadow. This is done using the page invalid and read-only bits in the page table of the first KVM guest. If the first guest re-accesses one of the origin pages of a shadow, it gets a fault and the affected parts of the shadow page table hierarchy need to be removed again.

PGSTE tables don't have to be shadowed, as the interpretation assists can't deal with the invalid bits in the shadow pte being set differently than the original ones provided by the first KVM guest.

Many bug fixes and improvements by David Hildenbrand.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
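The interface added to arch/s390/include/asm/gmap.h is meant to be driven on demand from the host's fault handling for the nested guest. The following is a rough, illustrative sketch only and is not part of this patch: the function name and parameters are made up for the example, a region-first ASCE with all table levels missing is assumed, the table origins are assumed to come from a walk of the nested guest's DAT tables (not shown), and all -EAGAIN retry and reference-count handling is elided.

#include <asm/gmap.h>

/*
 * Illustrative sketch only: r2t, r3t, sgt, pgt and paddr are assumed to
 * come from a walk of the nested guest's DAT tables (not shown), and
 * "parent" is the gmap of the first KVM guest.
 */
static int shadow_one_page(struct gmap *parent, unsigned long asce,
			   unsigned long saddr, unsigned long r2t,
			   unsigned long r3t, unsigned long sgt,
			   unsigned long pgt, unsigned long paddr, int write)
{
	struct gmap *sg;
	int rc;

	/* Find or create the shadow gmap for the nested guest's ASCE */
	sg = gmap_shadow(parent, asce);
	if (!sg)
		return -ENOMEM;
	/* Build the shadow table hierarchy top down */
	rc = gmap_shadow_r2t(sg, saddr, r2t);
	if (!rc)
		rc = gmap_shadow_r3t(sg, saddr, r3t);
	if (!rc)
		rc = gmap_shadow_sgt(sg, saddr, sgt);
	if (!rc)
		rc = gmap_shadow_pgt(sg, saddr, pgt);
	if (!rc)
		/* Map the page itself; the origin tables were already made
		 * read-only in the parent gmap by the calls above */
		rc = gmap_shadow_page(sg, saddr, paddr, write);
	return rc;
}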
-rw-r--r--  arch/s390/include/asm/gmap.h       |   52
-rw-r--r--  arch/s390/include/asm/pgalloc.h    |    2
-rw-r--r--  arch/s390/include/asm/pgtable.h    |   10
-rw-r--r--  arch/s390/include/asm/processor.h  |    1
-rw-r--r--  arch/s390/mm/fault.c               |    1
-rw-r--r--  arch/s390/mm/gmap.c                | 1150
-rw-r--r--  arch/s390/mm/pgalloc.c             |   23
-rw-r--r--  arch/s390/mm/pgtable.c             |   57
8 files changed, 1262 insertions(+), 34 deletions(-)
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index e69853ce55da..58e65ee5b2d2 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -10,6 +10,7 @@
10
11/**
12 * struct gmap_struct - guest address space
13 * @list: list head for the mm->context gmap list
14 * @crst_list: list of all crst tables used in the guest address space
15 * @mm: pointer to the parent mm_struct
16 * @guest_to_host: radix tree with guest to host address translation
@@ -19,6 +20,13 @@
20 * @table: pointer to the page directory
21 * @asce: address space control element for gmap page table
22 * @pfault_enabled: defines if pfaults are applicable for the guest
23 * @host_to_rmap: radix tree with gmap_rmap lists
24 * @children: list of shadow gmap structures
25 * @pt_list: list of all page tables used in the shadow guest address space
26 * @shadow_lock: spinlock to protect the shadow gmap list
27 * @parent: pointer to the parent gmap for shadow guest address spaces
28 * @orig_asce: ASCE for which the shadow page table has been created
29 * @removed: flag to indicate if a shadow guest address space has been removed
30 */
31struct gmap {
32 struct list_head list;
@@ -33,9 +41,33 @@ struct gmap {
41 unsigned long asce_end;
42 void *private;
43 bool pfault_enabled;
44 /* Additional data for shadow guest address spaces */
45 struct radix_tree_root host_to_rmap;
46 struct list_head children;
47 struct list_head pt_list;
48 spinlock_t shadow_lock;
49 struct gmap *parent;
50 unsigned long orig_asce;
51 bool removed;
52};
53
54/**
55 * struct gmap_rmap - reverse mapping for shadow page table entries
56 * @next: pointer to next rmap in the list
57 * @raddr: virtual rmap address in the shadow guest address space
58 */
59struct gmap_rmap {
60 struct gmap_rmap *next;
61 unsigned long raddr;
62};
63
64#define gmap_for_each_rmap(pos, head) \
65 for (pos = (head); pos; pos = pos->next)
66
67#define gmap_for_each_rmap_safe(pos, n, head) \
68 for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
69
70/**
71 * struct gmap_notifier - notify function block for page invalidation
72 * @notifier_call: address of callback function
73 */
@@ -46,6 +78,11 @@ struct gmap_notifier {
78 unsigned long end);
79};
80
81static inline int gmap_is_shadow(struct gmap *gmap)
82{
83 return !!gmap->parent;
84}
85
86struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
87void gmap_remove(struct gmap *gmap);
88struct gmap *gmap_get(struct gmap *gmap);
@@ -64,9 +101,22 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
101void __gmap_zap(struct gmap *, unsigned long gaddr);
102void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
103
104int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
105
106struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce);
107int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t);
108int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t);
109int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt);
110int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt);
111int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
112 unsigned long *pgt, int *dat_protection);
113int gmap_shadow_page(struct gmap *sg, unsigned long saddr,
114 unsigned long paddr, int write);
115
116void gmap_register_pte_notifier(struct gmap_notifier *);
117void gmap_unregister_pte_notifier(struct gmap_notifier *);
69void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *);
118void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
119 unsigned long bits);
120
121int gmap_mprotect_notify(struct gmap *, unsigned long start,
122 unsigned long len, int prot);
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index da34cb6b1f3b..f4eb9843eed4 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *);
19void crst_table_free(struct mm_struct *, unsigned long *);
20
21unsigned long *page_table_alloc(struct mm_struct *);
22struct page *page_table_alloc_pgste(struct mm_struct *mm);
23void page_table_free(struct mm_struct *, unsigned long *);
24void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
25void page_table_free_pgste(struct page *page);
26extern int page_table_allocate_pgste;
27
28static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 35dde6afffcf..a6e7fc8f5b49 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -256,6 +256,7 @@ static inline int is_module_addr(void *addr)
256/* Bits in the region table entry */
257#define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */
258#define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */
259#define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */
260#define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */
261#define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */
262#define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */
@@ -327,6 +328,7 @@ static inline int is_module_addr(void *addr)
328#define PGSTE_GC_BIT 0x0002000000000000UL
329#define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */
330#define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */
331#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */
332
333/* Guest Page State used for virtualization */
334#define _PGSTE_GPS_ZERO 0x0000000080000000UL
@@ -885,12 +887,16 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
887void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
888 pte_t *ptep, pte_t entry);
889void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
888void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
890void ptep_notify(struct mm_struct *mm, unsigned long addr,
891 pte_t *ptep, unsigned long bits);
892int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
890 pte_t *ptep, int prot);
893 pte_t *ptep, int prot, unsigned long bit);
894void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
895 pte_t *ptep , int reset);
896void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
897int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
898 pte_t *sptep, pte_t *tptep, int write);
899void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
900
901bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
902int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 9d4d311d7e52..94c80b6d031d 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -109,6 +109,7 @@ struct thread_struct {
109 unsigned long ksp; /* kernel stack pointer */
110 mm_segment_t mm_segment;
111 unsigned long gmap_addr; /* address of last gmap fault. */
112 unsigned int gmap_write_flag; /* gmap fault write indication */
113 unsigned int gmap_pfault; /* signal of a pending guest pfault */
114 struct per_regs per_user; /* User specified PER registers */
115 struct per_event per_event; /* Cause of the last PER trap */
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 19288c1b36d3..b84416c11c43 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -418,6 +418,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
418 (struct gmap *) S390_lowcore.gmap : NULL;
419 if (gmap) {
420 current->thread.gmap_addr = address;
421 current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
422 address = __gmap_translate(gmap, address);
423 if (address == -EFAULT) {
424 fault = VM_FAULT_BADMAP;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index fe25f1915800..6695a09a3885 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -55,9 +55,13 @@ static struct gmap *gmap_alloc(unsigned long limit)
55 if (!gmap)
56 goto out;
57 INIT_LIST_HEAD(&gmap->crst_list);
58 INIT_LIST_HEAD(&gmap->children);
59 INIT_LIST_HEAD(&gmap->pt_list);
60 INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
61 INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
62 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
63 spin_lock_init(&gmap->guest_table_lock);
64 spin_lock_init(&gmap->shadow_lock);
65 atomic_set(&gmap->ref_count, 1);
66 page = alloc_pages(GFP_KERNEL, 2);
67 if (!page)
@@ -132,9 +136,38 @@ static void gmap_radix_tree_free(struct radix_tree_root *root)
136 } while (nr > 0);
137}
138
139static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
140{
141 struct gmap_rmap *rmap, *rnext, *head;
142 struct radix_tree_iter iter;
143 unsigned long indices[16];
144 unsigned long index;
145 void **slot;
146 int i, nr;
147
148 /* A radix tree is freed by deleting all of its entries */
149 index = 0;
150 do {
151 nr = 0;
152 radix_tree_for_each_slot(slot, root, &iter, index) {
153 indices[nr] = iter.index;
154 if (++nr == 16)
155 break;
156 }
157 for (i = 0; i < nr; i++) {
158 index = indices[i];
159 head = radix_tree_delete(root, index);
160 gmap_for_each_rmap_safe(rmap, rnext, head)
161 kfree(rmap);
162 }
163 } while (nr > 0);
164}
165
166/**
167 * gmap_free - free a guest address space
168 * @gmap: pointer to the guest address space structure
169 *
170 * No locks required. There are no references to this gmap anymore.
171 */
172static void gmap_free(struct gmap *gmap)
173{
@@ -145,6 +178,17 @@ static void gmap_free(struct gmap *gmap)
178 __free_pages(page, 2);
179 gmap_radix_tree_free(&gmap->guest_to_host);
180 gmap_radix_tree_free(&gmap->host_to_guest);
181
182 /* Free additional data for a shadow gmap */
183 if (gmap_is_shadow(gmap)) {
184 /* Free all page tables. */
185 list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
186 page_table_free_pgste(page);
187 gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
188 /* Release reference to the parent */
189 gmap_put(gmap->parent);
190 }
191
192 kfree(gmap);
193}
194
@@ -180,8 +224,20 @@ EXPORT_SYMBOL_GPL(gmap_put);
224 */
225void gmap_remove(struct gmap *gmap)
226{
227 struct gmap *sg, *next;
228
229 /* Flush tlb. */
230 gmap_flush_tlb(gmap);
231 /* Remove all shadow gmaps linked to this gmap */
232 if (!list_empty(&gmap->children)) {
233 spin_lock(&gmap->shadow_lock);
234 list_for_each_entry_safe(sg, next, &gmap->children, list) {
235 gmap_flush_tlb(sg);
236 list_del(&sg->list);
237 gmap_put(sg);
238 }
239 spin_unlock(&gmap->shadow_lock);
240 }
241 /* Remove gmap from the pre-mm list */
242 spin_lock(&gmap->mm->context.gmap_lock);
243 list_del_rcu(&gmap->list);
@@ -227,7 +283,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
283 return -ENOMEM;
284 new = (unsigned long *) page_to_phys(page);
285 crst_table_init(new, init);
230 spin_lock(&gmap->mm->page_table_lock);
286 spin_lock(&gmap->guest_table_lock);
287 if (*table & _REGION_ENTRY_INVALID) {
288 list_add(&page->lru, &gmap->crst_list);
289 *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
@@ -235,7 +291,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
291 page->index = gaddr;
292 page = NULL;
293 }
238 spin_unlock(&gmap->mm->page_table_lock);
294 spin_unlock(&gmap->guest_table_lock);
295 if (page)
296 __free_pages(page, 2);
297 return 0;
@@ -271,6 +327,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
327 unsigned long *entry;
328 int flush = 0;
329
330 BUG_ON(gmap_is_shadow(gmap));
331 spin_lock(&gmap->guest_table_lock);
332 entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
333 if (entry) {
@@ -310,6 +367,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
367 unsigned long off;
368 int flush;
369
370 BUG_ON(gmap_is_shadow(gmap));
371 if ((to | len) & (PMD_SIZE - 1))
372 return -EINVAL;
373 if (len == 0 || to + len < to)
@@ -341,6 +399,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
399 unsigned long off;
400 int flush;
401
402 BUG_ON(gmap_is_shadow(gmap));
403 if ((from | to | len) & (PMD_SIZE - 1))
404 return -EINVAL;
405 if (len == 0 || from + len < from || to + len < to ||
@@ -378,6 +437,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
437 * This function does not establish potentially missing page table entries.
438 * The mmap_sem of the mm that belongs to the address space must be held
439 * when this function gets called.
440 *
441 * Note: Can also be called for shadow gmaps.
442 */
443unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
444{
@@ -385,6 +446,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
446
447 vmaddr = (unsigned long)
448 radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
449 /* Note: guest_to_host is empty for a shadow gmap */
450 return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
451}
452EXPORT_SYMBOL_GPL(__gmap_translate);
@@ -451,6 +513,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
513 pmd_t *pmd;
514 int rc;
515
516 BUG_ON(gmap_is_shadow(gmap));
517 /* Create higher level tables in the gmap page table */
518 table = gmap->table;
519 if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
@@ -646,36 +709,65 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
709 * gmap_table_walk - walk the gmap page tables
710 * @gmap: pointer to guest mapping meta data structure
711 * @gaddr: virtual address in the guest address space
712 * @level: page table level to stop at
713 *
714 * Returns a table entry pointer for the given guest address and @level
715 * @level=0 : returns a pointer to a page table table entry (or NULL)
716 * @level=1 : returns a pointer to a segment table entry (or NULL)
717 * @level=2 : returns a pointer to a region-3 table entry (or NULL)
718 * @level=3 : returns a pointer to a region-2 table entry (or NULL)
719 * @level=4 : returns a pointer to a region-1 table entry (or NULL)
720 *
721 * Returns NULL if the gmap page tables could not be walked to the
722 * requested level.
723 *
650 * Returns a table pointer for the given guest address.
724 * Note: Can also be called for shadow gmaps.
725 */
726static inline unsigned long *gmap_table_walk(struct gmap *gmap,
653 unsigned long gaddr)
727 unsigned long gaddr, int level)
728{
729 unsigned long *table;
730
731 if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
732 return NULL;
733 if (gmap_is_shadow(gmap) && gmap->removed)
734 return NULL;
735 if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
736 return NULL;
737 table = gmap->table;
738 switch (gmap->asce & _ASCE_TYPE_MASK) {
739 case _ASCE_TYPE_REGION1:
740 table += (gaddr >> 53) & 0x7ff;
741 if (level == 4)
742 break;
743 if (*table & _REGION_ENTRY_INVALID)
744 return NULL;
745 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
746 /* Fallthrough */
747 case _ASCE_TYPE_REGION2:
748 table += (gaddr >> 42) & 0x7ff;
749 if (level == 3)
750 break;
751 if (*table & _REGION_ENTRY_INVALID)
752 return NULL;
753 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
754 /* Fallthrough */
755 case _ASCE_TYPE_REGION3:
756 table += (gaddr >> 31) & 0x7ff;
757 if (level == 2)
758 break;
759 if (*table & _REGION_ENTRY_INVALID)
760 return NULL;
761 table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
762 /* Fallthrough */
763 case _ASCE_TYPE_SEGMENT:
764 table += (gaddr >> 20) & 0x7ff;
765 if (level == 1)
766 break;
767 if (*table & _REGION_ENTRY_INVALID)
768 return NULL;
769 table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
770 table += (gaddr >> 12) & 0xff;
771 }
772 return table;
773}
@@ -688,16 +780,27 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
780 * @ptl: pointer to the spinlock pointer
781 *
782 * Returns a pointer to the locked pte for a guest address, or NULL
783 *
784 * Note: Can also be called for shadow gmaps.
785 */
786static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
787 spinlock_t **ptl)
788{
789 unsigned long *table;
790
791 if (gmap_is_shadow(gmap))
792 spin_lock(&gmap->guest_table_lock);
793 /* Walk the gmap page table, lock and get pte pointer */
698 table = gmap_table_walk(gmap, gaddr);
794 table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
699 if (!table || *table & _SEGMENT_ENTRY_INVALID)
795 if (!table || *table & _SEGMENT_ENTRY_INVALID) {
796 if (gmap_is_shadow(gmap))
797 spin_unlock(&gmap->guest_table_lock);
798 return NULL;
799 }
800 if (gmap_is_shadow(gmap)) {
801 *ptl = &gmap->guest_table_lock;
802 return pte_offset_map((pmd_t *) table, gaddr);
803 }
804 return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
805}
806
@@ -717,6 +820,7 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
820 struct mm_struct *mm = gmap->mm;
821 bool unlocked = false;
822
823 BUG_ON(gmap_is_shadow(gmap));
824 if (fixup_user_fault(current, mm, vmaddr, FAULT_FLAG_WRITE, &unlocked))
825 return -EFAULT;
826 if (unlocked)
@@ -735,6 +839,51 @@ static void gmap_pte_op_end(spinlock_t *ptl)
839 spin_unlock(ptl);
840}
841
842/*
843 * gmap_protect_range - remove access rights to memory and set pgste bits
844 * @gmap: pointer to guest mapping meta data structure
845 * @gaddr: virtual address in the guest address space
846 * @len: size of area
847 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
848 * @bits: pgste notification bits to set
849 *
850 * Returns 0 if successfully protected, -ENOMEM if out of memory and
851 * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
852 *
853 * Called with sg->mm->mmap_sem in read.
854 *
855 * Note: Can also be called for shadow gmaps.
856 */
857static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
858 unsigned long len, int prot, unsigned long bits)
859{
860 unsigned long vmaddr;
861 spinlock_t *ptl;
862 pte_t *ptep;
863 int rc;
864
865 while (len) {
866 rc = -EAGAIN;
867 ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
868 if (ptep) {
869 rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
870 gmap_pte_op_end(ptl);
871 }
872 if (rc) {
873 vmaddr = __gmap_translate(gmap, gaddr);
874 if (IS_ERR_VALUE(vmaddr))
875 return vmaddr;
876 rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr);
877 if (rc)
878 return rc;
879 continue;
880 }
881 gaddr += PAGE_SIZE;
882 len -= PAGE_SIZE;
883 }
884 return 0;
885}
886
887/**
888 * gmap_mprotect_notify - change access rights for a range of ptes and
889 * call the notifier if any pte changes again
@@ -752,61 +901,1012 @@ static void gmap_pte_op_end(spinlock_t *ptl)
901int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
902 unsigned long len, int prot)
903{
755 unsigned long vmaddr;
904 int rc;
756 spinlock_t *ptl;
757 pte_t *ptep;
758 int rc = 0;
759 905
760 if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
906 if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
907 return -EINVAL;
908 if (!MACHINE_HAS_ESOP && prot == PROT_READ)
909 return -EINVAL;
910 down_read(&gmap->mm->mmap_sem);
765 while (len) {
911 rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
912 up_read(&gmap->mm->mmap_sem);
913 return rc;
914}
915EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
916
917/**
918 * gmap_read_table - get an unsigned long value from a guest page table using
919 * absolute addressing, without marking the page referenced.
920 * @gmap: pointer to guest mapping meta data structure
921 * @gaddr: virtual address in the guest address space
922 * @val: pointer to the unsigned long value to return
923 *
924 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
925 * if reading using the virtual address failed.
926 *
927 * Called with gmap->mm->mmap_sem in read.
928 */
929int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
930{
931 unsigned long address, vmaddr;
932 spinlock_t *ptl;
933 pte_t *ptep, pte;
934 int rc;
935
936 while (1) {
937 rc = -EAGAIN;
938 ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
939 if (ptep) {
769 rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot);
940 pte = *ptep;
941 if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
942 address = pte_val(pte) & PAGE_MASK;
943 address += gaddr & ~PAGE_MASK;
944 *val = *(unsigned long *) address;
945 pte_val(*ptep) |= _PAGE_YOUNG;
946 /* Do *NOT* clear the _PAGE_INVALID bit! */
947 rc = 0;
948 }
949 gmap_pte_op_end(ptl);
950 }
951 if (!rc)
952 break;
953 vmaddr = __gmap_translate(gmap, gaddr);
954 if (IS_ERR_VALUE(vmaddr)) {
955 rc = vmaddr;
956 break;
957 }
958 rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr);
959 if (rc)
960 break;
961 }
962 return rc;
963}
964EXPORT_SYMBOL_GPL(gmap_read_table);
965
966/**
967 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
968 * @sg: pointer to the shadow guest address space structure
969 * @vmaddr: vm address associated with the rmap
970 * @rmap: pointer to the rmap structure
971 *
972 * Called with the sg->guest_table_lock
973 */
974static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
975 struct gmap_rmap *rmap)
976{
977 void **slot;
978
979 BUG_ON(!gmap_is_shadow(sg));
980 slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
981 if (slot) {
982 rmap->next = radix_tree_deref_slot_protected(slot,
983 &sg->guest_table_lock);
984 radix_tree_replace_slot(slot, rmap);
985 } else {
986 rmap->next = NULL;
987 radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
988 rmap);
989 }
990}
991
992/**
993 * gmap_protect_rmap - modify access rights to memory and create an rmap
994 * @sg: pointer to the shadow guest address space structure
995 * @raddr: rmap address in the shadow gmap
996 * @paddr: address in the parent guest address space
997 * @len: length of the memory area to protect
998 * @prot: indicates access rights: none, read-only or read-write
999 *
1000 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
1001 * if out of memory and -EFAULT if paddr is invalid.
1002 */
1003static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
1004 unsigned long paddr, unsigned long len, int prot)
1005{
1006 struct gmap *parent;
1007 struct gmap_rmap *rmap;
1008 unsigned long vmaddr;
1009 spinlock_t *ptl;
1010 pte_t *ptep;
1011 int rc;
1012
1013 BUG_ON(!gmap_is_shadow(sg));
1014 parent = sg->parent;
1015 while (len) {
1016 vmaddr = __gmap_translate(parent, paddr);
1017 if (IS_ERR_VALUE(vmaddr))
1018 return vmaddr;
1019 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
1020 if (!rmap)
1021 return -ENOMEM;
1022 rmap->raddr = raddr;
1023 rc = radix_tree_preload(GFP_KERNEL);
1024 if (rc) {
1025 kfree(rmap);
1026 return rc;
1027 }
1028 rc = -EAGAIN;
1029 ptep = gmap_pte_op_walk(parent, paddr, &ptl);
1030 if (ptep) {
1031 spin_lock(&sg->guest_table_lock);
1032 rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
1033 PGSTE_VSIE_BIT);
1034 if (!rc)
1035 gmap_insert_rmap(sg, vmaddr, rmap);
1036 spin_unlock(&sg->guest_table_lock);
1037 gmap_pte_op_end(ptl);
1038 }
1039 radix_tree_preload_end();
1040 if (rc) {
1041 kfree(rmap);
1042 rc = gmap_pte_op_fixup(parent, paddr, vmaddr);
1043 if (rc)
1044 return rc;
1045 continue;
1046 }
1047 paddr += PAGE_SIZE;
1048 len -= PAGE_SIZE;
1049 }
1050 return 0;
1051}
770 gmap_pte_op_end(ptl);
771 }
772 if (rc) {
773 vmaddr = __gmap_translate(gmap, gaddr);
774 if (IS_ERR_VALUE(vmaddr)) {
775 rc = vmaddr;
776 break;
777 }
778 rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr);
779 if (rc)
780 break;
781 continue;
782 }
783 gaddr += PAGE_SIZE;
784 len -= PAGE_SIZE;
785 }
786 up_read(&gmap->mm->mmap_sem);
1051}
1052
1053#define _SHADOW_RMAP_MASK 0x7
1054#define _SHADOW_RMAP_REGION1 0x5
1055#define _SHADOW_RMAP_REGION2 0x4
1056#define _SHADOW_RMAP_REGION3 0x3
1057#define _SHADOW_RMAP_SEGMENT 0x2
1058#define _SHADOW_RMAP_PGTABLE 0x1
1059
1060/**
1061 * gmap_idte_one - invalidate a single region or segment table entry
1062 * @asce: region or segment table *origin* + table-type bits
1063 * @vaddr: virtual address to identify the table entry to flush
1064 *
1065 * The invalid bit of a single region or segment table entry is set
1066 * and the associated TLB entries depending on the entry are flushed.
1067 * The table-type of the @asce identifies the portion of the @vaddr
1068 * that is used as the invalidation index.
1069 */
1070static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
1071{
1072 asm volatile(
1073 " .insn rrf,0xb98e0000,%0,%1,0,0"
1074 : : "a" (asce), "a" (vaddr) : "cc", "memory");
1075}
1076
1077/**
1078 * gmap_unshadow_page - remove a page from a shadow page table
1079 * @sg: pointer to the shadow guest address space structure
1080 * @raddr: rmap address in the shadow guest address space
1081 *
1082 * Called with the sg->guest_table_lock
1083 */
1084static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
1085{
1086 unsigned long *table;
1087
1088 BUG_ON(!gmap_is_shadow(sg));
1089 table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
1090 if (!table || *table & _PAGE_INVALID)
1091 return;
1092 gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
1093 ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
1094}
1095
1096/**
1097 * __gmap_unshadow_pgt - remove all entries from a shadow page table
1098 * @sg: pointer to the shadow guest address space structure
1099 * @raddr: rmap address in the shadow guest address space
1100 * @pgt: pointer to the start of a shadow page table
1101 *
1102 * Called with the sg->guest_table_lock
1103 */
1104static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
1105 unsigned long *pgt)
1106{
1107 int i;
1108
1109 BUG_ON(!gmap_is_shadow(sg));
1110 for (i = 0; i < 256; i++, raddr += 1UL << 12)
1111 pgt[i] = _PAGE_INVALID;
1112}
1113
1114/**
1115 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
1116 * @sg: pointer to the shadow guest address space structure
1117 * @raddr: address in the shadow guest address space
1118 *
1119 * Called with the sg->guest_table_lock
1120 */
1121static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
1122{
1123 unsigned long sto, *ste, *pgt;
1124 struct page *page;
1125
1126 BUG_ON(!gmap_is_shadow(sg));
1127 ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
1128 if (!ste || *ste & _SEGMENT_ENTRY_INVALID)
1129 return;
1130 gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
1131 sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
1132 gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
1133 pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
1134 *ste = _SEGMENT_ENTRY_EMPTY;
1135 __gmap_unshadow_pgt(sg, raddr, pgt);
1136 /* Free page table */
1137 page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
1138 list_del(&page->lru);
1139 page_table_free_pgste(page);
1140}
1141
1142/**
1143 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
1144 * @sg: pointer to the shadow guest address space structure
1145 * @raddr: rmap address in the shadow guest address space
1146 * @sgt: pointer to the start of a shadow segment table
1147 *
1148 * Called with the sg->guest_table_lock
1149 */
1150static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
1151 unsigned long *sgt)
1152{
1153 unsigned long asce, *pgt;
1154 struct page *page;
1155 int i;
1156
1157 BUG_ON(!gmap_is_shadow(sg));
1158 asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
1159 for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
1160 if (sgt[i] & _SEGMENT_ENTRY_INVALID)
1161 continue;
1162 pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
1163 sgt[i] = _SEGMENT_ENTRY_EMPTY;
1164 __gmap_unshadow_pgt(sg, raddr, pgt);
1165 /* Free page table */
1166 page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
1167 list_del(&page->lru);
1168 page_table_free_pgste(page);
1169 }
1170}
1171
1172/**
1173 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
1174 * @sg: pointer to the shadow guest address space structure
1175 * @raddr: rmap address in the shadow guest address space
1176 *
1177 * Called with the shadow->guest_table_lock
1178 */
1179static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
1180{
1181 unsigned long r3o, *r3e, *sgt;
1182 struct page *page;
1183
1184 BUG_ON(!gmap_is_shadow(sg));
1185 r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
1186 if (!r3e || *r3e & _REGION_ENTRY_INVALID)
1187 return;
1188 gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
1189 r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
1190 gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
1191 sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
1192 *r3e = _REGION3_ENTRY_EMPTY;
1193 __gmap_unshadow_sgt(sg, raddr, sgt);
1194 /* Free segment table */
1195 page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
1196 list_del(&page->lru);
1197 __free_pages(page, 2);
1198}
1199
1200/**
1201 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
1202 * @sg: pointer to the shadow guest address space structure
1203 * @raddr: address in the shadow guest address space
1204 * @r3t: pointer to the start of a shadow region-3 table
1205 *
1206 * Called with the sg->guest_table_lock
1207 */
1208static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
1209 unsigned long *r3t)
1210{
1211 unsigned long asce, *sgt;
1212 struct page *page;
1213 int i;
1214
1215 BUG_ON(!gmap_is_shadow(sg));
1216 asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
1217 for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
1218 if (r3t[i] & _REGION_ENTRY_INVALID)
1219 continue;
1220 sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
1221 r3t[i] = _REGION3_ENTRY_EMPTY;
1222 __gmap_unshadow_sgt(sg, raddr, sgt);
1223 /* Free segment table */
1224 page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
1225 list_del(&page->lru);
1226 __free_pages(page, 2);
1227 }
1228}
1229
1230/**
1231 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
1232 * @sg: pointer to the shadow guest address space structure
1233 * @raddr: rmap address in the shadow guest address space
1234 *
1235 * Called with the sg->guest_table_lock
1236 */
1237static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
1238{
1239 unsigned long r2o, *r2e, *r3t;
1240 struct page *page;
1241
1242 BUG_ON(!gmap_is_shadow(sg));
1243 r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
1244 if (!r2e || *r2e & _REGION_ENTRY_INVALID)
1245 return;
1246 gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
1247 r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
1248 gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
1249 r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
1250 *r2e = _REGION2_ENTRY_EMPTY;
1251 __gmap_unshadow_r3t(sg, raddr, r3t);
1252 /* Free region 3 table */
1253 page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
1254 list_del(&page->lru);
1255 __free_pages(page, 2);
1256}
1257
1258/**
1259 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
1260 * @sg: pointer to the shadow guest address space structure
1261 * @raddr: rmap address in the shadow guest address space
1262 * @r2t: pointer to the start of a shadow region-2 table
1263 *
1264 * Called with the sg->guest_table_lock
1265 */
1266static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
1267 unsigned long *r2t)
1268{
1269 unsigned long asce, *r3t;
1270 struct page *page;
1271 int i;
1272
1273 BUG_ON(!gmap_is_shadow(sg));
1274 asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
1275 for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
1276 if (r2t[i] & _REGION_ENTRY_INVALID)
1277 continue;
1278 r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
1279 r2t[i] = _REGION2_ENTRY_EMPTY;
1280 __gmap_unshadow_r3t(sg, raddr, r3t);
1281 /* Free region 3 table */
1282 page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
1283 list_del(&page->lru);
1284 __free_pages(page, 2);
1285 }
1286}
1287
1288/**
1289 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
1290 * @sg: pointer to the shadow guest address space structure
1291 * @raddr: rmap address in the shadow guest address space
1292 *
1293 * Called with the sg->guest_table_lock
1294 */
1295static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
1296{
1297 unsigned long r1o, *r1e, *r2t;
1298 struct page *page;
1299
1300 BUG_ON(!gmap_is_shadow(sg));
1301 r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
1302 if (!r1e || *r1e & _REGION_ENTRY_INVALID)
1303 return;
1304 gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
1305 r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
1306 gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
1307 r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
1308 *r1e = _REGION1_ENTRY_EMPTY;
1309 __gmap_unshadow_r2t(sg, raddr, r2t);
1310 /* Free region 2 table */
1311 page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
1312 list_del(&page->lru);
1313 __free_pages(page, 2);
1314}
1315
1316/**
1317 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
1318 * @sg: pointer to the shadow guest address space structure
1319 * @raddr: rmap address in the shadow guest address space
1320 * @r1t: pointer to the start of a shadow region-1 table
1321 *
1322 * Called with the shadow->guest_table_lock
1323 */
1324static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
1325 unsigned long *r1t)
1326{
1327 unsigned long asce, *r2t;
1328 struct page *page;
1329 int i;
1330
1331 BUG_ON(!gmap_is_shadow(sg));
1332 asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
1333 for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
1334 if (r1t[i] & _REGION_ENTRY_INVALID)
1335 continue;
1336 r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
1337 __gmap_unshadow_r2t(sg, raddr, r2t);
1338 /* Clear entry and flush translation r1t -> r2t */
1339 gmap_idte_one(asce, raddr);
1340 r1t[i] = _REGION1_ENTRY_EMPTY;
1341 /* Free region 2 table */
1342 page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
1343 list_del(&page->lru);
1344 __free_pages(page, 2);
1345 }
1346}
1347
1348/**
1349 * gmap_unshadow - remove a shadow page table completely
1350 * @sg: pointer to the shadow guest address space structure
1351 *
1352 * Called with sg->guest_table_lock
1353 */
1354static void gmap_unshadow(struct gmap *sg)
1355{
1356 unsigned long *table;
1357
1358 BUG_ON(!gmap_is_shadow(sg));
1359 if (sg->removed)
1360 return;
1361 sg->removed = 1;
1362 gmap_call_notifier(sg, 0, -1UL);
1363 table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
1364 switch (sg->asce & _ASCE_TYPE_MASK) {
1365 case _ASCE_TYPE_REGION1:
1366 __gmap_unshadow_r1t(sg, 0, table);
1367 break;
1368 case _ASCE_TYPE_REGION2:
1369 __gmap_unshadow_r2t(sg, 0, table);
1370 break;
1371 case _ASCE_TYPE_REGION3:
1372 __gmap_unshadow_r3t(sg, 0, table);
1373 break;
1374 case _ASCE_TYPE_SEGMENT:
1375 __gmap_unshadow_sgt(sg, 0, table);
1376 break;
1377 }
1378}
1379
1380/**
1381 * gmap_find_shadow - find a specific asce in the list of shadow tables
1382 * @parent: pointer to the parent gmap
1383 * @asce: ASCE for which the shadow table is created
1384 *
1385 * Returns the pointer to a gmap if a shadow table with the given asce is
1386 * already available, otherwise NULL
1387 */
1388static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce)
1389{
1390 struct gmap *sg;
1391
1392 list_for_each_entry(sg, &parent->children, list) {
1393 if (sg->orig_asce != asce || sg->removed)
1394 continue;
1395 atomic_inc(&sg->ref_count);
1396 return sg;
1397 }
1398 return NULL;
1399}
1400
1401/**
1402 * gmap_shadow - create/find a shadow guest address space
1403 * @parent: pointer to the parent gmap
1404 * @asce: ASCE for which the shadow table is created
1405 *
1406 * The pages of the top level page table referred by the asce parameter
1407 * will be set to read-only and marked in the PGSTEs of the kvm process.
1408 * The shadow table will be removed automatically on any change to the
1409 * PTE mapping for the source table.
1410 *
1411 * Returns a guest address space structure, NULL if out of memory or if
1412 * anything goes wrong while protecting the top level pages.
1413 */
1414struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce)
1415{
1416 struct gmap *sg, *new;
1417 unsigned long limit;
1418 int rc;
1419
1420 BUG_ON(gmap_is_shadow(parent));
1421 spin_lock(&parent->shadow_lock);
1422 sg = gmap_find_shadow(parent, asce);
1423 spin_unlock(&parent->shadow_lock);
1424 if (sg)
1425 return sg;
1426 /* Create a new shadow gmap */
1427 limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
1428 new = gmap_alloc(limit);
1429 if (!new)
1430 return NULL;
1431 new->mm = parent->mm;
1432 new->parent = gmap_get(parent);
1433 new->orig_asce = asce;
1434 down_read(&parent->mm->mmap_sem);
1435 rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
1436 ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
1437 PROT_READ, PGSTE_VSIE_BIT);
1438 up_read(&parent->mm->mmap_sem);
1439 if (rc) {
1440 atomic_set(&new->ref_count, 2);
1441 spin_lock(&parent->shadow_lock);
1442 /* Recheck if another CPU created the same shadow */
1443 sg = gmap_find_shadow(parent, asce);
1444 if (!sg) {
1445 list_add(&new->list, &parent->children);
1446 sg = new;
1447 new = NULL;
1448 }
1449 spin_unlock(&parent->shadow_lock);
1450 }
1451 if (new)
1452 gmap_free(new);
1453 return sg;
1454}
1455EXPORT_SYMBOL_GPL(gmap_shadow);
1456
1457/**
1458 * gmap_shadow_r2t - create an empty shadow region 2 table
1459 * @sg: pointer to the shadow guest address space structure
1460 * @saddr: faulting address in the shadow gmap
1461 * @r2t: parent gmap address of the region 2 table to get shadowed
1462 *
1463 * The r2t parameter specifies the address of the source table. The
1464 * four pages of the source table are made read-only in the parent gmap
1465 * address space. A write to the source table area @r2t will automatically
1466 * remove the shadow r2 table and all of its decendents.
1467 *
1468 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1469 * shadow table structure is incomplete, -ENOMEM if out of memory and
1470 * -EFAULT if an address in the parent gmap could not be resolved.
1471 *
1472 * Called with sg->mm->mmap_sem in read.
1473 */
1474int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t)
1475{
1476 unsigned long raddr, origin, offset, len;
1477 unsigned long *s_r2t, *table;
1478 struct page *page;
1479 int rc;
1480
1481 BUG_ON(!gmap_is_shadow(sg));
1482 /* Allocate a shadow region second table */
1483 page = alloc_pages(GFP_KERNEL, 2);
1484 if (!page)
1485 return -ENOMEM;
1486 page->index = r2t & _REGION_ENTRY_ORIGIN;
1487 s_r2t = (unsigned long *) page_to_phys(page);
1488 /* Install shadow region second table */
1489 spin_lock(&sg->guest_table_lock);
1490 table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
1491 if (!table) {
1492 rc = -EAGAIN; /* Race with unshadow */
1493 goto out_free;
1494 }
1495 if (!(*table & _REGION_ENTRY_INVALID)) {
1496 rc = 0; /* Already established */
1497 goto out_free;
1498 }
1499 crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
1500 *table = (unsigned long) s_r2t |
1501 _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1;
1502 list_add(&page->lru, &sg->crst_list);
1503 spin_unlock(&sg->guest_table_lock);
1504 /* Make r2t read-only in parent gmap page table */
1505 raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
1506 origin = r2t & _REGION_ENTRY_ORIGIN;
1507 offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
1508 len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
1509 rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
1510 if (rc) {
1511 spin_lock(&sg->guest_table_lock);
1512 gmap_unshadow_r2t(sg, raddr);
1513 spin_unlock(&sg->guest_table_lock);
1514 }
1515 return rc;
1516out_free:
1517 spin_unlock(&sg->guest_table_lock);
1518 __free_pages(page, 2);
1519 return rc;
1520}
789EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
1521EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
1522
1523/**
1524 * gmap_shadow_r3t - create a shadow region 3 table
1525 * @sg: pointer to the shadow guest address space structure
1526 * @saddr: faulting address in the shadow gmap
1527 * @r3t: parent gmap address of the region 3 table to get shadowed
1528 *
1529 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1530 * shadow table structure is incomplete, -ENOMEM if out of memory and
1531 * -EFAULT if an address in the parent gmap could not be resolved.
1532 *
1533 * Called with sg->mm->mmap_sem in read.
1534 */
1535int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t)
1536{
1537 unsigned long raddr, origin, offset, len;
1538 unsigned long *s_r3t, *table;
1539 struct page *page;
1540 int rc;
1541
1542 BUG_ON(!gmap_is_shadow(sg));
1543 /* Allocate a shadow region second table */
1544 page = alloc_pages(GFP_KERNEL, 2);
1545 if (!page)
1546 return -ENOMEM;
1547 page->index = r3t & _REGION_ENTRY_ORIGIN;
1548 s_r3t = (unsigned long *) page_to_phys(page);
1549 /* Install shadow region second table */
1550 spin_lock(&sg->guest_table_lock);
1551 table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
1552 if (!table) {
1553 rc = -EAGAIN; /* Race with unshadow */
1554 goto out_free;
1555 }
1556 if (!(*table & _REGION_ENTRY_INVALID)) {
1557 rc = 0; /* Already established */
1558 goto out_free;
1559 }
1560 crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
1561 *table = (unsigned long) s_r3t |
1562 _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2;
1563 list_add(&page->lru, &sg->crst_list);
1564 spin_unlock(&sg->guest_table_lock);
1565 /* Make r3t read-only in parent gmap page table */
1566 raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
1567 origin = r3t & _REGION_ENTRY_ORIGIN;
1568 offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
1569 len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
1570 rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
1571 if (rc) {
1572 spin_lock(&sg->guest_table_lock);
1573 gmap_unshadow_r3t(sg, raddr);
1574 spin_unlock(&sg->guest_table_lock);
1575 }
1576 return rc;
1577out_free:
1578 spin_unlock(&sg->guest_table_lock);
1579 __free_pages(page, 2);
1580 return rc;
1581}
1582EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
1583
1584/**
1585 * gmap_shadow_sgt - create a shadow segment table
1586 * @sg: pointer to the shadow guest address space structure
1587 * @saddr: faulting address in the shadow gmap
1588 * @sgt: parent gmap address of the segment table to get shadowed
1589 *
1590 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
1591 * shadow table structure is incomplete, -ENOMEM if out of memory and
1592 * -EFAULT if an address in the parent gmap could not be resolved.
1593 *
1594 * Called with sg->mm->mmap_sem in read.
1595 */
1596int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt)
1597{
1598 unsigned long raddr, origin, offset, len;
1599 unsigned long *s_sgt, *table;
1600 struct page *page;
1601 int rc;
1602
1603 BUG_ON(!gmap_is_shadow(sg));
1604 /* Allocate a shadow segment table */
1605 page = alloc_pages(GFP_KERNEL, 2);
1606 if (!page)
1607 return -ENOMEM;
1608 page->index = sgt & _REGION_ENTRY_ORIGIN;
1609 s_sgt = (unsigned long *) page_to_phys(page);
1610 /* Install shadow region second table */
1611 spin_lock(&sg->guest_table_lock);
1612 table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
1613 if (!table) {
1614 rc = -EAGAIN; /* Race with unshadow */
1615 goto out_free;
1616 }
1617 if (!(*table & _REGION_ENTRY_INVALID)) {
1618 rc = 0; /* Already established */
1619 goto out_free;
1620 }
1621 crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
1622 *table = (unsigned long) s_sgt |
1623 _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3;
1624 list_add(&page->lru, &sg->crst_list);
1625 spin_unlock(&sg->guest_table_lock);
1626 /* Make sgt read-only in parent gmap page table */
1627 raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
1628 origin = sgt & _REGION_ENTRY_ORIGIN;
1629 offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
1630 len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
1631 rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
1632 if (rc) {
1633 spin_lock(&sg->guest_table_lock);
1634 gmap_unshadow_sgt(sg, raddr);
1635 spin_unlock(&sg->guest_table_lock);
1636 }
1637 return rc;
1638out_free:
1639 spin_unlock(&sg->guest_table_lock);
1640 __free_pages(page, 2);
1641 return rc;
1642}
1643EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
1644
1645/**
1646 * gmap_shadow_lookup_pgtable - find a shadow page table
1647 * @sg: pointer to the shadow guest address space structure
1648 * @saddr: the address in the shadow aguest address space
1649 * @pgt: parent gmap address of the page table to get shadowed
1650 * @dat_protection: if the pgtable is marked as protected by dat
1651 *
1652 * Returns 0 if the shadow page table was found and -EAGAIN if the page
1653 * table was not found.
1654 *
1655 * Called with sg->mm->mmap_sem in read.
1656 */
1657int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
1658 unsigned long *pgt, int *dat_protection)
1659{
1660 unsigned long *table;
1661 struct page *page;
1662 int rc;
1663
1664 BUG_ON(!gmap_is_shadow(sg));
1665 spin_lock(&sg->guest_table_lock);
1666 table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
1667 if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
1668 /* Shadow page tables are full pages (pte+pgste) */
1669 page = pfn_to_page(*table >> PAGE_SHIFT);
1670 *pgt = page->index;
1671 *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
1672 rc = 0;
1673 } else {
1674 rc = -EAGAIN;
1675 }
1676 spin_unlock(&sg->guest_table_lock);
1677 return rc;
1678
1679}
1680EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
1681
1682/**
1683 * gmap_shadow_pgt - instantiate a shadow page table
1684 * @sg: pointer to the shadow guest address space structure
1685 * @saddr: faulting address in the shadow gmap
1686 * @pgt: parent gmap address of the page table to get shadowed
1687 *
1688 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1689 * shadow table structure is incomplete, -ENOMEM if out of memory,
1690 * -EFAULT if an address in the parent gmap could not be resolved and
1691 *
1692 * Called with gmap->mm->mmap_sem in read
1693 */
1694int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt)
1695{
1696 unsigned long raddr, origin;
1697 unsigned long *s_pgt, *table;
1698 struct page *page;
1699 int rc;
1700
1701 BUG_ON(!gmap_is_shadow(sg));
1702 /* Allocate a shadow page table */
1703 page = page_table_alloc_pgste(sg->mm);
1704 if (!page)
1705 return -ENOMEM;
1706 page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
1707 s_pgt = (unsigned long *) page_to_phys(page);
1708 /* Install shadow page table */
1709 spin_lock(&sg->guest_table_lock);
1710 table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
1711 if (!table) {
1712 rc = -EAGAIN; /* Race with unshadow */
1713 goto out_free;
1714 }
1715 if (!(*table & _SEGMENT_ENTRY_INVALID)) {
1716 rc = 0; /* Already established */
1717 goto out_free;
1718 }
1719 *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
1720 (pgt & _SEGMENT_ENTRY_PROTECT);
1721 list_add(&page->lru, &sg->pt_list);
1722 spin_unlock(&sg->guest_table_lock);
1723 /* Make pgt read-only in parent gmap page table (not the pgste) */
1724 raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
1725 origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
1726 rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
1727 if (rc) {
1728 spin_lock(&sg->guest_table_lock);
1729 gmap_unshadow_pgt(sg, raddr);
1730 spin_unlock(&sg->guest_table_lock);
1731 }
1732 return rc;
1733out_free:
1734 spin_unlock(&sg->guest_table_lock);
1735 page_table_free_pgste(page);
1736 return rc;
1737
1738}
1739EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
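To show how the lookup and instantiation routines are meant to be combined, a hedged caller sketch; shadow_fault_sketch() and its guest_pgt/guest_prot/paddr parameters are hypothetical stand-ins for what a fault handler would read from the guest's segment table, only the three gmap_shadow_*() calls come from this patch.

/*
 * Hypothetical caller, not part of this patch: resolve a fault at saddr
 * in the shadow gmap sg.  guest_pgt/guest_prot stand in for the page
 * table origin and DAT protection bit taken from the guest segment
 * entry, paddr is the parent gmap address backing the faulting page.
 */
static int shadow_fault_sketch(struct gmap *sg, unsigned long saddr,
			       unsigned long guest_pgt, int guest_prot,
			       unsigned long paddr, int write)
{
	unsigned long pgt;
	int dat_protection = guest_prot;
	int rc;

	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection);
	if (rc == -EAGAIN)
		/* no shadow page table for this segment yet - create one */
		rc = gmap_shadow_pgt(sg, saddr, guest_pgt |
				     (guest_prot ? _SEGMENT_ENTRY_PROTECT : 0));
	if (rc)
		return rc;
	/* shadow the single page, never r/w if the segment is protected */
	return gmap_shadow_page(sg, saddr, paddr, write && !dat_protection);
}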
1740
1741/**
1742 * gmap_shadow_page - create a shadow page mapping
1743 * @sg: pointer to the shadow guest address space structure
1744 * @saddr: faulting address in the shadow gmap
1745 * @paddr: parent gmap address to get mapped at @saddr
1746 * @write: =1 map r/w, =0 map r/o
1747 *
1748 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1749 * shadow table structure is incomplete, -ENOMEM if out of memory and
1750 * -EFAULT if an address in the parent gmap could not be resolved.
1751 *
1752 * Called with sg->mm->mmap_sem in read.
1753 */
1754int gmap_shadow_page(struct gmap *sg, unsigned long saddr,
1755 unsigned long paddr, int write)
1756{
1757 struct gmap *parent;
1758 struct gmap_rmap *rmap;
1759 unsigned long vmaddr;
1760 spinlock_t *ptl;
1761 pte_t *sptep, *tptep;
1762 int rc;
1763
1764 BUG_ON(!gmap_is_shadow(sg));
1765 parent = sg->parent;
1766
1767 rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
1768 if (!rmap)
1769 return -ENOMEM;
1770 rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
1771
1772 while (1) {
1773 vmaddr = __gmap_translate(parent, paddr);
1774 if (IS_ERR_VALUE(vmaddr)) {
1775 rc = vmaddr;
1776 break;
1777 }
1778 rc = radix_tree_preload(GFP_KERNEL);
1779 if (rc)
1780 break;
1781 rc = -EAGAIN;
1782 sptep = gmap_pte_op_walk(parent, paddr, &ptl);
1783 if (sptep) {
1784 spin_lock(&sg->guest_table_lock);
1785 /* Get page table pointer */
1786 tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
1787 if (!tptep) {
1788 spin_unlock(&sg->guest_table_lock);
1789 gmap_pte_op_end(ptl);
1790 radix_tree_preload_end();
1791 break;
1792 }
1793 rc = ptep_shadow_pte(sg->mm, saddr,
1794 sptep, tptep, write);
1795 if (rc > 0) {
1796 /* Success and a new mapping */
1797 gmap_insert_rmap(sg, vmaddr, rmap);
1798 rmap = NULL;
1799 rc = 0;
1800 }
1801 gmap_pte_op_end(ptl);
1802 spin_unlock(&sg->guest_table_lock);
1803 }
1804 radix_tree_preload_end();
1805 if (!rc)
1806 break;
1807 rc = gmap_pte_op_fixup(parent, paddr, vmaddr);
1808 if (rc)
1809 break;
1810 }
1811 kfree(rmap);
1812 return rc;
1813}
1814EXPORT_SYMBOL_GPL(gmap_shadow_page);
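The preload/insert sequence inside the retry loop above follows the usual radix-tree idiom: reserve nodes with radix_tree_preload() before taking a spinlock so the insertion cannot fail with -ENOMEM while the lock is held. A generic sketch of that idiom, with tree, lock, index and item as placeholders rather than gmap symbols:

static int insert_sketch(struct radix_tree_root *tree, spinlock_t *lock,
			 unsigned long index, void *item)
{
	int rc;

	rc = radix_tree_preload(GFP_KERNEL);	/* may sleep, so do it first */
	if (rc)
		return rc;
	spin_lock(lock);
	rc = radix_tree_insert(tree, index, item);
	spin_unlock(lock);
	radix_tree_preload_end();		/* ends the preload section */
	return rc;
}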
1815
1816/**
1817 * gmap_shadow_notify - handle notifications for a shadow gmap
1818 *
1819 * Called with sg->parent->shadow_lock held.
1820 */
1821static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
1822 unsigned long offset, pte_t *pte)
1823{
1824 struct gmap_rmap *rmap, *rnext, *head;
1825 unsigned long gaddr, start, end, bits, raddr;
1826 unsigned long *table;
1827
1828 BUG_ON(!gmap_is_shadow(sg));
1829 spin_lock(&sg->parent->guest_table_lock);
1830 table = radix_tree_lookup(&sg->parent->host_to_guest,
1831 vmaddr >> PMD_SHIFT);
1832 gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
1833 spin_unlock(&sg->parent->guest_table_lock);
1834 if (!table)
1835 return;
1836
1837 spin_lock(&sg->guest_table_lock);
1838 if (sg->removed) {
1839 spin_unlock(&sg->guest_table_lock);
1840 return;
1841 }
1842 /* Check for top level table */
1843 start = sg->orig_asce & _ASCE_ORIGIN;
1844 end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
1845 if (gaddr >= start && gaddr < end) {
1846 /* The complete shadow table has to go */
1847 gmap_unshadow(sg);
1848 spin_unlock(&sg->guest_table_lock);
1849 list_del(&sg->list);
1850 gmap_put(sg);
1851 return;
1852 }
1853	/* Remove the page table tree starting from a specific entry */
1854 head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
1855 gmap_for_each_rmap_safe(rmap, rnext, head) {
1856 bits = rmap->raddr & _SHADOW_RMAP_MASK;
1857 raddr = rmap->raddr ^ bits;
1858 switch (bits) {
1859 case _SHADOW_RMAP_REGION1:
1860 gmap_unshadow_r2t(sg, raddr);
1861 break;
1862 case _SHADOW_RMAP_REGION2:
1863 gmap_unshadow_r3t(sg, raddr);
1864 break;
1865 case _SHADOW_RMAP_REGION3:
1866 gmap_unshadow_sgt(sg, raddr);
1867 break;
1868 case _SHADOW_RMAP_SEGMENT:
1869 gmap_unshadow_pgt(sg, raddr);
1870 break;
1871 case _SHADOW_RMAP_PGTABLE:
1872 gmap_unshadow_page(sg, raddr);
1873 break;
1874 }
1875 kfree(rmap);
1876 }
1877 spin_unlock(&sg->guest_table_lock);
1878}
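gmap_shadow_notify() can tell from a single rmap entry both where to unshadow and at which level, because the shadow addresses stored in ->raddr are aligned and their low bits are therefore free to carry the _SHADOW_RMAP_* level tag. A stand-alone sketch of this tag-in-low-bits encoding, using made-up tag values instead of the real gmap.h constants:

#include <assert.h>

#define RMAP_MASK	0x7UL	/* placeholder for _SHADOW_RMAP_MASK */
#define RMAP_PGTABLE	0x1UL	/* placeholder for _SHADOW_RMAP_PGTABLE */

int main(void)
{
	unsigned long saddr = 0x20345000UL;		/* page-aligned address */
	unsigned long raddr = saddr | RMAP_PGTABLE;	/* encode the level */
	unsigned long bits = raddr & RMAP_MASK;		/* decode the level tag */
	unsigned long addr = raddr ^ bits;		/* decode the address */

	assert(bits == RMAP_PGTABLE && addr == saddr);
	return 0;
}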
790 1879
791/** 1880/**
792 * ptep_notify - call all invalidation callbacks for a specific pte. 1881 * ptep_notify - call all invalidation callbacks for a specific pte.
793 * @mm: pointer to the process mm_struct 1882 * @mm: pointer to the process mm_struct
794 * @addr: virtual address in the process address space 1883 * @addr: virtual address in the process address space
795 * @pte: pointer to the page table entry 1884 * @pte: pointer to the page table entry
1885 * @bits: bits from the pgste that caused the notify call
796 * 1886 *
797 * This function is assumed to be called with the page table lock held 1887 * This function is assumed to be called with the page table lock held
798 * for the pte to notify. 1888 * for the pte to notify.
799 */ 1889 */
800void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) 1890void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
1891 pte_t *pte, unsigned long bits)
801{ 1892{
802 unsigned long offset, gaddr; 1893 unsigned long offset, gaddr;
803 unsigned long *table; 1894 unsigned long *table;
804 struct gmap *gmap; 1895 struct gmap *gmap, *sg, *next;
805 1896
806 offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); 1897 offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
807 offset = offset * (4096 / sizeof(pte_t)); 1898 offset = offset * (4096 / sizeof(pte_t));
808 rcu_read_lock(); 1899 rcu_read_lock();
809 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 1900 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
1901 if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
1902 spin_lock(&gmap->shadow_lock);
1903 list_for_each_entry_safe(sg, next,
1904 &gmap->children, list)
1905 gmap_shadow_notify(sg, vmaddr, offset, pte);
1906 spin_unlock(&gmap->shadow_lock);
1907 }
1908 if (!(bits & PGSTE_IN_BIT))
1909 continue;
810 spin_lock(&gmap->guest_table_lock); 1910 spin_lock(&gmap->guest_table_lock);
811 table = radix_tree_lookup(&gmap->host_to_guest, 1911 table = radix_tree_lookup(&gmap->host_to_guest,
812 vmaddr >> PMD_SHIFT); 1912 vmaddr >> PMD_SHIFT);
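The two-step offset computation at the top of ptep_notify() turns the byte position of the pte within its 256-entry page table into the byte offset of the mapped page within the 1 MB segment (each 8-byte pte stands for one 4 KB page). A stand-alone sketch of that arithmetic with a sample pte address:

#include <stdio.h>

int main(void)
{
	/* 43rd pte (index 42) of a page table starting at 0x1234500000 */
	unsigned long pte_addr = 0x1234500000UL + 42 * 8;
	unsigned long offset;

	offset = pte_addr & (255 * 8);	/* byte offset of the pte in its table */
	offset = offset * (4096 / 8);	/* byte offset of the page in the segment */

	printf("pte index %lu -> segment offset %#lx\n",
	       (pte_addr & (255 * 8)) / 8, offset);	/* prints 42 -> 0x2a000 */
	return 0;
}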
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 7be1f94f70a8..9c57a295a045 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
137 return new; 137 return new;
138} 138}
139 139
140#ifdef CONFIG_PGSTE
141
142struct page *page_table_alloc_pgste(struct mm_struct *mm)
143{
144 struct page *page;
145 unsigned long *table;
146
147 page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
148 if (page) {
149 table = (unsigned long *) page_to_phys(page);
150 clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
151 clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
152 }
153 return page;
154}
155
156void page_table_free_pgste(struct page *page)
157{
158 __free_page(page);
159}
160
161#endif /* CONFIG_PGSTE */
162
140/* 163/*
141 * page table entry allocation/free routines. 164 * page table entry allocation/free routines.
142 */ 165 */
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ab65fb11e058..5b02583fbf4c 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -184,9 +184,12 @@ static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
184 pte_t *ptep, pgste_t pgste) 184 pte_t *ptep, pgste_t pgste)
185{ 185{
186#ifdef CONFIG_PGSTE 186#ifdef CONFIG_PGSTE
187 if (pgste_val(pgste) & PGSTE_IN_BIT) { 187 unsigned long bits;
188 pgste_val(pgste) &= ~PGSTE_IN_BIT; 188
189 ptep_notify(mm, addr, ptep); 189 bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
190 if (bits) {
191 pgste_val(pgste) ^= bits;
192 ptep_notify(mm, addr, ptep, bits);
190 } 193 }
191#endif 194#endif
192 return pgste; 195 return pgste;
@@ -420,12 +423,13 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
420 * @addr: virtual address in the guest address space 423 * @addr: virtual address in the guest address space
421 * @ptep: pointer to the page table entry 424 * @ptep: pointer to the page table entry
422 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE 425 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
426 * @bit: pgste bit to set (e.g. for notification)
423 * 427 *
424 * Returns 0 if the access rights were changed and -EAGAIN if the current 428 * Returns 0 if the access rights were changed and -EAGAIN if the current
425 * and requested access rights are incompatible. 429 * and requested access rights are incompatible.
426 */ 430 */
427int ptep_force_prot(struct mm_struct *mm, unsigned long addr, 431int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
428 pte_t *ptep, int prot) 432 pte_t *ptep, int prot, unsigned long bit)
429{ 433{
430 pte_t entry; 434 pte_t entry;
431 pgste_t pgste; 435 pgste_t pgste;
@@ -441,7 +445,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
441 pgste_set_unlock(ptep, pgste); 445 pgste_set_unlock(ptep, pgste);
442 return -EAGAIN; 446 return -EAGAIN;
443 } 447 }
444 /* Change access rights and set the pgste notification bit */ 448 /* Change access rights and set pgste bit */
445 if (prot == PROT_NONE && !pte_i) { 449 if (prot == PROT_NONE && !pte_i) {
446 ptep_flush_direct(mm, addr, ptep); 450 ptep_flush_direct(mm, addr, ptep);
447 pgste = pgste_update_all(entry, pgste, mm); 451 pgste = pgste_update_all(entry, pgste, mm);
@@ -452,12 +456,53 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
452 pte_val(entry) &= ~_PAGE_INVALID; 456 pte_val(entry) &= ~_PAGE_INVALID;
453 pte_val(entry) |= _PAGE_PROTECT; 457 pte_val(entry) |= _PAGE_PROTECT;
454 } 458 }
455 pgste_val(pgste) |= PGSTE_IN_BIT; 459 pgste_val(pgste) |= bit;
456 pgste = pgste_set_pte(ptep, pgste, entry); 460 pgste = pgste_set_pte(ptep, pgste, entry);
457 pgste_set_unlock(ptep, pgste); 461 pgste_set_unlock(ptep, pgste);
458 return 0; 462 return 0;
459} 463}
460 464
465int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
466 pte_t *sptep, pte_t *tptep, int write)
467{
468 pgste_t spgste, tpgste;
469 pte_t spte, tpte;
470 int rc = -EAGAIN;
471
472 spgste = pgste_get_lock(sptep);
473 spte = *sptep;
474 if (!(pte_val(spte) & _PAGE_INVALID) &&
475 !(pte_val(spte) & _PAGE_PROTECT)) {
476 rc = 0;
477 if (!(pte_val(*tptep) & _PAGE_INVALID))
478 /* Update existing mapping */
479 ptep_flush_direct(mm, saddr, tptep);
480 else
481 rc = 1;
482 pgste_val(spgste) |= PGSTE_VSIE_BIT;
483 tpgste = pgste_get_lock(tptep);
484 pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
485 (write ? 0 : _PAGE_PROTECT);
486 /* don't touch the storage key - it belongs to parent pgste */
487 tpgste = pgste_set_pte(tptep, tpgste, tpte);
488 pgste_set_unlock(tptep, tpgste);
489 }
490 pgste_set_unlock(sptep, spgste);
491 return rc;
492}
493
494void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
495{
496 pgste_t pgste;
497
498 pgste = pgste_get_lock(ptep);
499 /* notifier is called by the caller */
500 ptep_flush_direct(mm, saddr, ptep);
501 /* don't touch the storage key - it belongs to parent pgste */
502 pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
503 pgste_set_unlock(ptep, pgste);
504}
505
461static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) 506static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
462{ 507{
463 if (!non_swap_entry(entry)) 508 if (!non_swap_entry(entry))
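ptep_shadow_pte() above has a tri-state result: a negative value (-EAGAIN) when the parent pte is still invalid or write-protected, 0 when an existing shadow mapping was refreshed, and 1 when a new mapping was created and the caller still has to record an rmap entry, which is what gmap_shadow_page() does. A hedged caller-side sketch of that contract; record_rmap_sketch() is a hypothetical placeholder, not a function from this series:

/* placeholder: gmap_shadow_page() inserts a struct gmap_rmap at this point */
static void record_rmap_sketch(unsigned long saddr)
{
}

static int shadow_one_pte_sketch(struct mm_struct *mm, unsigned long saddr,
				 pte_t *sptep, pte_t *tptep, int write)
{
	int rc;

	rc = ptep_shadow_pte(mm, saddr, sptep, tptep, write);
	if (rc < 0)
		return rc;		/* parent pte not usable yet */
	if (rc > 0)
		record_rmap_sketch(saddr);	/* new mapping: remember it */
	return 0;
}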