aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStewart Smith <stewart@linux.vnet.ibm.com>2014-07-18 00:18:43 -0400
committerAlexander Graf <agraf@suse.de>2014-07-28 09:23:17 -0400
commit9678cdaae93932473f696fdea5debf3eee1e1260 (patch)
tree5ee47b4c46a83eeaf59822cc53fa5db720ad5b96
parentde9bdd1a604d30b4e05dc18b5cc6354949253abd (diff)
Use the POWER8 Micro Partition Prefetch Engine in KVM HV on POWER8
The POWER8 processor has a Micro Partition Prefetch Engine, which is a fancy way of saying "has way to store and load contents of L2 or L2+MRU way of L3 cache". We initiate the storing of the log (list of addresses) using the logmpp instruction and start restore by writing to a SPR. The logmpp instruction takes parameters in a single 64bit register: - starting address of the table to store log of L2/L2+L3 cache contents - 32kb for L2 - 128kb for L2+L3 - Aligned relative to maximum size of the table (32kb or 128kb) - Log control (no-op, L2 only, L2 and L3, abort logout) We should abort any ongoing logging before initiating one. To initiate restore, we write to the MPPR SPR. The format of what to write to the SPR is similar to the logmpp instruction parameter: - starting address of the table to read from (same alignment requirements) - table size (no data, until end of table) - prefetch rate (from fastest possible to slower. about every 8, 16, 24 or 32 cycles) The idea behind loading and storing the contents of L2/L3 cache is to reduce memory latency in a system that is frequently swapping vcores on a physical CPU. The best case scenario for doing this is when some vcores are doing very cache heavy workloads. The worst case is when they have about 0 cache hits, so we just generate needless memory operations. This implementation just does L2 store/load. In my benchmarks this proves to be useful. Benchmark 1: - 16 core POWER8 - 3x Ubuntu 14.04LTS guests (LE) with 8 VCPUs each - No split core/SMT - two guests running sysbench memory test. sysbench --test=memory --num-threads=8 run - one guest running apache bench (of default HTML page) ab -n 490000 -c 400 http://localhost/ This benchmark aims to measure performance of real world application (apache) where other guests are cache hot with their own workloads. The sysbench memory benchmark does pointer sized writes to a (small) memory buffer in a loop. In this benchmark with this patch I can see an improvement both in requests per second (~5%) and in mean and median response times (again, about 5%). The spread of minimum and maximum response times were largely unchanged. benchmark 2: - Same VM config as benchmark 1 - all three guests running sysbench memory benchmark This benchmark aims to see if there is a positive or negative affect to this cache heavy benchmark. Although due to the nature of the benchmark (stores) we may not see a difference in performance, but rather hopefully an improvement in consistency of performance (when vcore switched in, don't have to wait many times for cachelines to be pulled in) The results of this benchmark are improvements in consistency of performance rather than performance itself. With this patch, the few outliers in duration go away and we get more consistent performance in each guest. benchmark 3: - same 3 guests and CPU configuration as benchmark 1 and 2. - two idle guests - 1 guest running STREAM benchmark This scenario also saw performance improvement with this patch. On Copy and Scale workloads from STREAM, I got 5-6% improvement with this patch. For Add and triad, it was around 10% (or more). benchmark 4: - same 3 guests as previous benchmarks - two guests running sysbench --memory, distinctly different cache heavy workload - one guest running STREAM benchmark. Similar improvements to benchmark 3. benchmark 5: - 1 guest, 8 VCPUs, Ubuntu 14.04 - Host configured with split core (SMT8, subcores-per-core=4) - STREAM benchmark In this benchmark, we see a 10-20% performance improvement across the board of STREAM benchmark results with this patch. Based on preliminary investigation and microbenchmarks by Prerna Saxena <prerna@linux.vnet.ibm.com> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com> Acked-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Alexander Graf <agraf@suse.de>
-rw-r--r--arch/powerpc/include/asm/cache.h7
-rw-r--r--arch/powerpc/include/asm/kvm_host.h2
-rw-r--r--arch/powerpc/include/asm/ppc-opcode.h17
-rw-r--r--arch/powerpc/include/asm/reg.h1
-rw-r--r--arch/powerpc/kvm/book3s_hv.c57
5 files changed, 83 insertions, 1 deletions
diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index ed0afc1e44a4..34a05a1a990b 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -3,6 +3,7 @@
3 3
4#ifdef __KERNEL__ 4#ifdef __KERNEL__
5 5
6#include <asm/reg.h>
6 7
7/* bytes per L1 cache line */ 8/* bytes per L1 cache line */
8#if defined(CONFIG_8xx) || defined(CONFIG_403GCX) 9#if defined(CONFIG_8xx) || defined(CONFIG_403GCX)
@@ -39,6 +40,12 @@ struct ppc64_caches {
39}; 40};
40 41
41extern struct ppc64_caches ppc64_caches; 42extern struct ppc64_caches ppc64_caches;
43
44static inline void logmpp(u64 x)
45{
46 asm volatile(PPC_LOGMPP(R1) : : "r" (x));
47}
48
42#endif /* __powerpc64__ && ! __ASSEMBLY__ */ 49#endif /* __powerpc64__ && ! __ASSEMBLY__ */
43 50
44#if defined(__ASSEMBLY__) 51#if defined(__ASSEMBLY__)
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 5fe2b5d17bc0..11385bb46527 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -307,6 +307,8 @@ struct kvmppc_vcore {
307 u32 arch_compat; 307 u32 arch_compat;
308 ulong pcr; 308 ulong pcr;
309 ulong dpdes; /* doorbell state (POWER8) */ 309 ulong dpdes; /* doorbell state (POWER8) */
310 void *mpp_buffer; /* Micro Partition Prefetch buffer */
311 bool mpp_buffer_is_valid;
310}; 312};
311 313
312#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) 314#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff)
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 3132bb9365f3..c636841fc772 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -139,6 +139,7 @@
139#define PPC_INST_ISEL 0x7c00001e 139#define PPC_INST_ISEL 0x7c00001e
140#define PPC_INST_ISEL_MASK 0xfc00003e 140#define PPC_INST_ISEL_MASK 0xfc00003e
141#define PPC_INST_LDARX 0x7c0000a8 141#define PPC_INST_LDARX 0x7c0000a8
142#define PPC_INST_LOGMPP 0x7c0007e4
142#define PPC_INST_LSWI 0x7c0004aa 143#define PPC_INST_LSWI 0x7c0004aa
143#define PPC_INST_LSWX 0x7c00042a 144#define PPC_INST_LSWX 0x7c00042a
144#define PPC_INST_LWARX 0x7c000028 145#define PPC_INST_LWARX 0x7c000028
@@ -275,6 +276,20 @@
275#define __PPC_EH(eh) 0 276#define __PPC_EH(eh) 0
276#endif 277#endif
277 278
279/* POWER8 Micro Partition Prefetch (MPP) parameters */
280/* Address mask is common for LOGMPP instruction and MPPR SPR */
281#define PPC_MPPE_ADDRESS_MASK 0xffffffffc000
282
283/* Bits 60 and 61 of MPP SPR should be set to one of the following */
284/* Aborting the fetch is indeed setting 00 in the table size bits */
285#define PPC_MPPR_FETCH_ABORT (0x0ULL << 60)
286#define PPC_MPPR_FETCH_WHOLE_TABLE (0x2ULL << 60)
287
288/* Bits 54 and 55 of register for LOGMPP instruction should be set to: */
289#define PPC_LOGMPP_LOG_L2 (0x02ULL << 54)
290#define PPC_LOGMPP_LOG_L2L3 (0x01ULL << 54)
291#define PPC_LOGMPP_LOG_ABORT (0x03ULL << 54)
292
278/* Deal with instructions that older assemblers aren't aware of */ 293/* Deal with instructions that older assemblers aren't aware of */
279#define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ 294#define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \
280 __PPC_RA(a) | __PPC_RB(b)) 295 __PPC_RA(a) | __PPC_RB(b))
@@ -283,6 +298,8 @@
283#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LDARX | \ 298#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LDARX | \
284 ___PPC_RT(t) | ___PPC_RA(a) | \ 299 ___PPC_RT(t) | ___PPC_RA(a) | \
285 ___PPC_RB(b) | __PPC_EH(eh)) 300 ___PPC_RB(b) | __PPC_EH(eh))
301#define PPC_LOGMPP(b) stringify_in_c(.long PPC_INST_LOGMPP | \
302 __PPC_RB(b))
286#define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LWARX | \ 303#define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LWARX | \
287 ___PPC_RT(t) | ___PPC_RA(a) | \ 304 ___PPC_RT(t) | ___PPC_RA(a) | \
288 ___PPC_RB(b) | __PPC_EH(eh)) 305 ___PPC_RB(b) | __PPC_EH(eh))
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 0ef17ade7730..c547b26371b8 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -225,6 +225,7 @@
225#define CTRL_TE 0x00c00000 /* thread enable */ 225#define CTRL_TE 0x00c00000 /* thread enable */
226#define CTRL_RUNLATCH 0x1 226#define CTRL_RUNLATCH 0x1
227#define SPRN_DAWR 0xB4 227#define SPRN_DAWR 0xB4
228#define SPRN_MPPR 0xB8 /* Micro Partition Prefetch Register */
228#define SPRN_RPR 0xBA /* Relative Priority Register */ 229#define SPRN_RPR 0xBA /* Relative Priority Register */
229#define SPRN_CIABR 0xBB 230#define SPRN_CIABR 0xBB
230#define CIABR_PRIV 0x3 231#define CIABR_PRIV 0x3
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 5042cccd9fde..c470d55cffe1 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -35,6 +35,7 @@
35 35
36#include <asm/reg.h> 36#include <asm/reg.h>
37#include <asm/cputable.h> 37#include <asm/cputable.h>
38#include <asm/cache.h>
38#include <asm/cacheflush.h> 39#include <asm/cacheflush.h>
39#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
@@ -69,6 +70,13 @@
69 70
70static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1); 71static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
71 72
73#if defined(CONFIG_PPC_64K_PAGES)
74#define MPP_BUFFER_ORDER 0
75#elif defined(CONFIG_PPC_4K_PAGES)
76#define MPP_BUFFER_ORDER 3
77#endif
78
79
72static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 80static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
73static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 81static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
74 82
@@ -1320,6 +1328,13 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
1320 vcore->first_vcpuid = core * threads_per_subcore; 1328 vcore->first_vcpuid = core * threads_per_subcore;
1321 vcore->kvm = kvm; 1329 vcore->kvm = kvm;
1322 1330
1331 vcore->mpp_buffer_is_valid = false;
1332
1333 if (cpu_has_feature(CPU_FTR_ARCH_207S))
1334 vcore->mpp_buffer = (void *)__get_free_pages(
1335 GFP_KERNEL|__GFP_ZERO,
1336 MPP_BUFFER_ORDER);
1337
1323 return vcore; 1338 return vcore;
1324} 1339}
1325 1340
@@ -1586,6 +1601,33 @@ static int on_primary_thread(void)
1586 return 1; 1601 return 1;
1587} 1602}
1588 1603
1604static void kvmppc_start_saving_l2_cache(struct kvmppc_vcore *vc)
1605{
1606 phys_addr_t phy_addr, mpp_addr;
1607
1608 phy_addr = (phys_addr_t)virt_to_phys(vc->mpp_buffer);
1609 mpp_addr = phy_addr & PPC_MPPE_ADDRESS_MASK;
1610
1611 mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_ABORT);
1612 logmpp(mpp_addr | PPC_LOGMPP_LOG_L2);
1613
1614 vc->mpp_buffer_is_valid = true;
1615}
1616
1617static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
1618{
1619 phys_addr_t phy_addr, mpp_addr;
1620
1621 phy_addr = virt_to_phys(vc->mpp_buffer);
1622 mpp_addr = phy_addr & PPC_MPPE_ADDRESS_MASK;
1623
1624 /* We must abort any in-progress save operations to ensure
1625 * the table is valid so that prefetch engine knows when to
1626 * stop prefetching. */
1627 logmpp(mpp_addr | PPC_LOGMPP_LOG_ABORT);
1628 mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
1629}
1630
1589/* 1631/*
1590 * Run a set of guest threads on a physical core. 1632 * Run a set of guest threads on a physical core.
1591 * Called with vc->lock held. 1633 * Called with vc->lock held.
@@ -1663,9 +1705,16 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
1663 1705
1664 srcu_idx = srcu_read_lock(&vc->kvm->srcu); 1706 srcu_idx = srcu_read_lock(&vc->kvm->srcu);
1665 1707
1708 if (vc->mpp_buffer_is_valid)
1709 kvmppc_start_restoring_l2_cache(vc);
1710
1666 __kvmppc_vcore_entry(); 1711 __kvmppc_vcore_entry();
1667 1712
1668 spin_lock(&vc->lock); 1713 spin_lock(&vc->lock);
1714
1715 if (vc->mpp_buffer)
1716 kvmppc_start_saving_l2_cache(vc);
1717
1669 /* disable sending of IPIs on virtual external irqs */ 1718 /* disable sending of IPIs on virtual external irqs */
1670 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 1719 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
1671 vcpu->cpu = -1; 1720 vcpu->cpu = -1;
@@ -2413,8 +2462,14 @@ static void kvmppc_free_vcores(struct kvm *kvm)
2413{ 2462{
2414 long int i; 2463 long int i;
2415 2464
2416 for (i = 0; i < KVM_MAX_VCORES; ++i) 2465 for (i = 0; i < KVM_MAX_VCORES; ++i) {
2466 if (kvm->arch.vcores[i] && kvm->arch.vcores[i]->mpp_buffer) {
2467 struct kvmppc_vcore *vc = kvm->arch.vcores[i];
2468 free_pages((unsigned long)vc->mpp_buffer,
2469 MPP_BUFFER_ORDER);
2470 }
2417 kfree(kvm->arch.vcores[i]); 2471 kfree(kvm->arch.vcores[i]);
2472 }
2418 kvm->arch.online_vcores = 0; 2473 kvm->arch.online_vcores = 0;
2419} 2474}
2420 2475