author	Konstantin Weitz <konstantin.weitz@gmail.com>	2013-04-17 11:36:29 -0400
committer	Martin Schwidefsky <schwidefsky@de.ibm.com>	2014-02-21 02:50:19 -0500
commit	b31288fa83b2bcc8834e1e208e9526b8bd5ce361 (patch)
tree	97b2372c7eeb7d7e792042e91f6489900aa38dfc /arch
parent	45961722f8e30ceab9d135b1ddc0947d53aef7c3 (diff)
s390/kvm: support collaborative memory management
This patch enables Collaborative Memory Management (CMM) for kvm
on s390. CMM allows the guest to inform the host about page usage
(see arch/s390/mm/cmm.c). The host uses this information to avoid
swapping in unused pages in the page fault handler. Further, a CPU
provided list of unused invalid pages is processed to reclaim swap
space of not yet accessed unused pages.

[ Martin Schwidefsky: patch reordering and cleanup ]

Signed-off-by: Konstantin Weitz <konstantin.weitz@gmail.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
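For context on the guest half of this protocol (not part of this patch): a guest marks a page unused by issuing the ESSA instruction, opcode 0xb9ab, the same opcode that handle_essa() is wired to in the b9_handlers table below. A minimal sketch of such a guest-side call, modeled on the page-state helpers in arch/s390/mm/page-states.c; the wrapper name essa_set_unused is illustrative, not a symbol introduced by this patch:

	/* ESSA usage-state codes, as used by the guest-side CMM code */
	#define ESSA_GET_STATE	0
	#define ESSA_SET_STABLE	1
	#define ESSA_SET_UNUSED	2

	/* Tell the hypervisor the 4K frame at paddr holds no useful data. */
	static inline void essa_set_unused(unsigned long paddr)
	{
		int rc;

		/* EXTRACT AND SET STORAGE ATTRIBUTES, .insn rrf opcode 0xb9ab */
		asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0"
			     : "=&d" (rc)
			     : "a" (paddr), "i" (ESSA_SET_UNUSED));
	}

On the host side this state shows up as _PGSTE_GPS_USAGE_UNUSED in the PGSTE; ptep_clear_flush() propagates it to the software _PAGE_UNUSED pte bit, and handle_essa()/__gmap_zap() below use it to drop the backing page without swapping it in.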
Diffstat (limited to 'arch')
-rw-r--r--	arch/s390/include/asm/kvm_host.h	 5
-rw-r--r--	arch/s390/include/asm/pgtable.h	26
-rw-r--r--	arch/s390/kvm/kvm-s390.c	25
-rw-r--r--	arch/s390/kvm/kvm-s390.h	 2
-rw-r--r--	arch/s390/kvm/priv.c	41
-rw-r--r--	arch/s390/mm/pgtable.c	77
6 files changed, 175 insertions(+), 1 deletion(-)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index eef3dd3fd9a9..9bf95bb30f1a 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -106,7 +106,9 @@ struct kvm_s390_sie_block {
 	__u64	gbea;			/* 0x0180 */
 	__u8	reserved188[24];	/* 0x0188 */
 	__u32	fac;			/* 0x01a0 */
-	__u8	reserved1a4[68];	/* 0x01a4 */
+	__u8	reserved1a4[20];	/* 0x01a4 */
+	__u64	cbrlo;			/* 0x01b8 */
+	__u8	reserved1c0[40];	/* 0x01c0 */
 	__u64	itdba;			/* 0x01e8 */
 	__u8	reserved1f0[16];	/* 0x01f0 */
 } __attribute__((packed));
@@ -155,6 +157,7 @@ struct kvm_vcpu_stat {
 	u32 instruction_stsi;
 	u32 instruction_stfl;
 	u32 instruction_tprot;
+	u32 instruction_essa;
 	u32 instruction_sigp_sense;
 	u32 instruction_sigp_sense_running;
 	u32 instruction_sigp_external_call;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index fc4bb82a0739..a7dd672c97f8 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -229,6 +229,7 @@ extern unsigned long MODULES_END;
 #define _PAGE_READ	0x010		/* SW pte read bit */
 #define _PAGE_WRITE	0x020		/* SW pte write bit */
 #define _PAGE_SPECIAL	0x040		/* SW associated with special page */
+#define _PAGE_UNUSED	0x080		/* SW bit for pgste usage state */
 #define __HAVE_ARCH_PTE_SPECIAL
 
 /* Set of bits not changed in pte_modify */
@@ -394,6 +395,12 @@ extern unsigned long MODULES_END;
 
 #endif /* CONFIG_64BIT */
 
+/* Guest Page State used for virtualization */
+#define _PGSTE_GPS_ZERO		0x0000000080000000UL
+#define _PGSTE_GPS_USAGE_MASK	0x0000000003000000UL
+#define _PGSTE_GPS_USAGE_STABLE	0x0000000000000000UL
+#define _PGSTE_GPS_USAGE_UNUSED	0x0000000001000000UL
+
 /*
  * A user page table pointer has the space-switch-event bit, the
  * private-space-control bit and the storage-alteration-event-control
@@ -617,6 +624,14 @@ static inline int pte_none(pte_t pte)
 	return pte_val(pte) == _PAGE_INVALID;
 }
 
+static inline int pte_swap(pte_t pte)
+{
+	/* Bit pattern: (pte & 0x603) == 0x402 */
+	return (pte_val(pte) & (_PAGE_INVALID | _PAGE_PROTECT |
+				_PAGE_TYPE | _PAGE_PRESENT))
+		== (_PAGE_INVALID | _PAGE_TYPE);
+}
+
 static inline int pte_file(pte_t pte)
 {
 	/* Bit pattern: (pte & 0x601) == 0x600 */
@@ -821,6 +836,7 @@ unsigned long gmap_translate(unsigned long address, struct gmap *);
 unsigned long __gmap_fault(unsigned long address, struct gmap *);
 unsigned long gmap_fault(unsigned long address, struct gmap *);
 void gmap_discard(unsigned long from, unsigned long to, struct gmap *);
+void __gmap_zap(unsigned long address, struct gmap *);
 
 void gmap_register_ipte_notifier(struct gmap_notifier *);
 void gmap_unregister_ipte_notifier(struct gmap_notifier *);
@@ -852,6 +868,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 
 	if (mm_has_pgste(mm)) {
 		pgste = pgste_get_lock(ptep);
+		pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
 		pgste_set_key(ptep, pgste, entry);
 		pgste_set_pte(ptep, entry);
 		pgste_set_unlock(ptep, pgste);
@@ -881,6 +898,12 @@ static inline int pte_young(pte_t pte)
 	return (pte_val(pte) & _PAGE_YOUNG) != 0;
 }
 
+#define __HAVE_ARCH_PTE_UNUSED
+static inline int pte_unused(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_UNUSED;
+}
+
 /*
  * pgd/pmd/pte modification functions
  */
@@ -1196,6 +1219,9 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
 	pte_val(*ptep) = _PAGE_INVALID;
 
 	if (mm_has_pgste(vma->vm_mm)) {
+		if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
+		    _PGSTE_GPS_USAGE_UNUSED)
+			pte_val(pte) |= _PAGE_UNUSED;
 		pgste = pgste_update_all(&pte, pgste);
 		pgste_set_unlock(ptep, pgste);
 	}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e0676f390d57..10b5db3c9bc4 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -68,6 +68,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
68 { "instruction_storage_key", VCPU_STAT(instruction_storage_key) }, 68 { "instruction_storage_key", VCPU_STAT(instruction_storage_key) },
69 { "instruction_stsch", VCPU_STAT(instruction_stsch) }, 69 { "instruction_stsch", VCPU_STAT(instruction_stsch) },
70 { "instruction_chsc", VCPU_STAT(instruction_chsc) }, 70 { "instruction_chsc", VCPU_STAT(instruction_chsc) },
71 { "instruction_essa", VCPU_STAT(instruction_essa) },
71 { "instruction_stsi", VCPU_STAT(instruction_stsi) }, 72 { "instruction_stsi", VCPU_STAT(instruction_stsi) },
72 { "instruction_stfl", VCPU_STAT(instruction_stfl) }, 73 { "instruction_stfl", VCPU_STAT(instruction_stfl) },
73 { "instruction_tprot", VCPU_STAT(instruction_tprot) }, 74 { "instruction_tprot", VCPU_STAT(instruction_tprot) },
@@ -283,7 +284,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	if (kvm_is_ucontrol(vcpu->kvm))
 		gmap_free(vcpu->arch.gmap);
 
+	if (vcpu->arch.sie_block->cbrlo)
+		__free_page(__pfn_to_page(
+				vcpu->arch.sie_block->cbrlo >> PAGE_SHIFT));
 	free_page((unsigned long)(vcpu->arch.sie_block));
+
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
@@ -390,6 +395,8 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
+	struct page *cbrl;
+
 	atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
 						    CPUSTAT_SM |
 						    CPUSTAT_STOPPED |
@@ -401,6 +408,14 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	vcpu->arch.sie_block->ecb2 = 8;
 	vcpu->arch.sie_block->eca  = 0xC1002001U;
 	vcpu->arch.sie_block->fac  = (int) (long) vfacilities;
+	if (kvm_enabled_cmma()) {
+		cbrl = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (cbrl) {
+			vcpu->arch.sie_block->ecb2 |= 0x80;
+			vcpu->arch.sie_block->ecb2 &= ~0x08;
+			vcpu->arch.sie_block->cbrlo = page_to_phys(cbrl);
+		}
+	}
 	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 	tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
 		     (unsigned long) vcpu);
@@ -761,6 +776,16 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
 	return rc;
 }
 
+bool kvm_enabled_cmma(void)
+{
+	if (!MACHINE_IS_LPAR)
+		return false;
+	/* only enable for z10 and later */
+	if (!MACHINE_HAS_EDAT1)
+		return false;
+	return true;
+}
+
 static int __vcpu_run(struct kvm_vcpu *vcpu)
 {
 	int rc, exit_reason;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index f9559b0bd620..564514f410f4 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -156,6 +156,8 @@ void s390_vcpu_block(struct kvm_vcpu *vcpu);
 void s390_vcpu_unblock(struct kvm_vcpu *vcpu);
 void exit_sie(struct kvm_vcpu *vcpu);
 void exit_sie_sync(struct kvm_vcpu *vcpu);
+/* are we going to support cmma? */
+bool kvm_enabled_cmma(void);
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
 
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 75beea632a10..aacb6b129914 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -636,8 +636,49 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int handle_essa(struct kvm_vcpu *vcpu)
+{
+	/* entries expected to be 1FF */
+	int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
+	unsigned long *cbrlo, cbrle;
+	struct gmap *gmap;
+	int i;
+
+	VCPU_EVENT(vcpu, 5, "cmma release %d pages", entries);
+	gmap = vcpu->arch.gmap;
+	vcpu->stat.instruction_essa++;
+	if (!kvm_enabled_cmma() || !vcpu->arch.sie_block->cbrlo)
+		return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+	if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	/* Rewind PSW to repeat the ESSA instruction */
+	vcpu->arch.sie_block->gpsw.addr =
+		__rewind_psw(vcpu->arch.sie_block->gpsw, 4);
+	vcpu->arch.sie_block->cbrlo &= PAGE_MASK;	/* reset nceo */
+	cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
+	down_read(&gmap->mm->mmap_sem);
+	for (i = 0; i < entries; ++i) {
+		cbrle = cbrlo[i];
+		if (unlikely(cbrle & ~PAGE_MASK || cbrle < 2 * PAGE_SIZE))
+			/* invalid entry */
+			break;
+		/* try to free backing */
+		__gmap_zap(cbrle, gmap);
+	}
+	up_read(&gmap->mm->mmap_sem);
+	if (i < entries)
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+	return 0;
+}
+
 static const intercept_handler_t b9_handlers[256] = {
 	[0x8d] = handle_epsw,
+	[0xab] = handle_essa,
 	[0xaf] = handle_pfmf,
 };
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 3584ed9b20a1..9e2b4705dea2 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -17,6 +17,7 @@
 #include <linux/quicklist.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
+#include <linux/swapops.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -594,6 +595,82 @@ unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
 }
 EXPORT_SYMBOL_GPL(gmap_fault);
 
+static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
+{
+	if (!non_swap_entry(entry))
+		dec_mm_counter(mm, MM_SWAPENTS);
+	else if (is_migration_entry(entry)) {
+		struct page *page = migration_entry_to_page(entry);
+
+		if (PageAnon(page))
+			dec_mm_counter(mm, MM_ANONPAGES);
+		else
+			dec_mm_counter(mm, MM_FILEPAGES);
+	}
+	free_swap_and_cache(entry);
+}
+
+/**
+ * The mm->mmap_sem lock must be held
+ */
+static void gmap_zap_unused(struct mm_struct *mm, unsigned long address)
+{
+	unsigned long ptev, pgstev;
+	spinlock_t *ptl;
+	pgste_t pgste;
+	pte_t *ptep, pte;
+
+	ptep = get_locked_pte(mm, address, &ptl);
+	if (unlikely(!ptep))
+		return;
+	pte = *ptep;
+	if (!pte_swap(pte))
+		goto out_pte;
+	/* Zap unused and logically-zero pages */
+	pgste = pgste_get_lock(ptep);
+	pgstev = pgste_val(pgste);
+	ptev = pte_val(pte);
+	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
+	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
+		gmap_zap_swap_entry(pte_to_swp_entry(pte), mm);
+		pte_clear(mm, address, ptep);
+	}
+	pgste_set_unlock(ptep, pgste);
+out_pte:
+	pte_unmap_unlock(*ptep, ptl);
+}
+
+/*
+ * this function is assumed to be called with mmap_sem held
+ */
+void __gmap_zap(unsigned long address, struct gmap *gmap)
+{
+	unsigned long *table, *segment_ptr;
+	unsigned long segment, pgstev, ptev;
+	struct gmap_pgtable *mp;
+	struct page *page;
+
+	segment_ptr = gmap_table_walk(address, gmap);
+	if (IS_ERR(segment_ptr))
+		return;
+	segment = *segment_ptr;
+	if (segment & _SEGMENT_ENTRY_INVALID)
+		return;
+	page = pfn_to_page(segment >> PAGE_SHIFT);
+	mp = (struct gmap_pgtable *) page->index;
+	address = mp->vmaddr | (address & ~PMD_MASK);
+	/* Page table is present */
+	table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN);
+	table = table + ((address >> 12) & 0xff);
+	pgstev = table[PTRS_PER_PTE];
+	ptev = table[0];
+	/* quick check, checked again with locks held */
+	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
+	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID)))
+		gmap_zap_unused(gmap->mm, address);
+}
+EXPORT_SYMBOL_GPL(__gmap_zap);
+
 void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
 {
 