diff options
author | Paul Mackerras <paulus@samba.org> | 2011-06-28 20:23:08 -0400 |
---|---|---|
committer | Avi Kivity <avi@redhat.com> | 2011-07-12 06:16:57 -0400 |
commit | 371fefd6f2dc46668e00871930dde613b88d4bde (patch) | |
tree | 35fe799343861405914d27873eb175eb04d6dce5 /arch/powerpc/kvm/book3s_hv_rmhandlers.S | |
parent | 54738c097163c3f01e67ccc85462b78d4d4f495f (diff) |
KVM: PPC: Allow book3s_hv guests to use SMT processor modes
This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7. The host still has to run single-threaded.
This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability. The return value of the ioctl querying this capability
is the number of vcpus per virtual CPU core (vcore), currently 4.
To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode. KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline). To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c. In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it. Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.
When a vcpu is created, it is assigned to a virtual CPU core.
The vcore number is obtained by dividing the vcpu number by the
number of threads per core in the host. The threads-per-core number is exported
to userspace via the KVM_CAP_PPC_SMT capability. If qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of the number of threads per core.
We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host. We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked. This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.
When a vcore starts to run, it executes in the context of one of the
vcpu threads. The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).
It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running. In that case it can start to run immediately as long as
none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest. It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.
Note that there is no fixed relationship between the hardware thread
number and the vcpu number. Hardware threads are assigned to vcpus
as they become runnable, so we will always use the lower-numbered
hardware threads in preference to higher-numbered threads if not all
the vcpus in the vcore are runnable, regardless of which vcpus are
runnable.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
Diffstat (limited to 'arch/powerpc/kvm/book3s_hv_rmhandlers.S')
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rmhandlers.S | 168 |
1 files changed, 162 insertions, 6 deletions
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index e6adaadcdff2..c9bf177b7cf2 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S | |||
@@ -30,8 +30,6 @@ | |||
30 | * * | 30 | * * |
31 | ****************************************************************************/ | 31 | ****************************************************************************/ |
32 | 32 | ||
33 | #define SHADOW_VCPU_OFF PACA_KVM_SVCPU | ||
34 | |||
35 | .globl kvmppc_skip_interrupt | 33 | .globl kvmppc_skip_interrupt |
36 | kvmppc_skip_interrupt: | 34 | kvmppc_skip_interrupt: |
37 | mfspr r13,SPRN_SRR0 | 35 | mfspr r13,SPRN_SRR0 |
@@ -79,6 +77,32 @@ _GLOBAL(kvmppc_hv_entry_trampoline) | |||
79 | * * | 77 | * * |
80 | *****************************************************************************/ | 78 | *****************************************************************************/ |
81 | 79 | ||
80 | #define XICS_XIRR 4 | ||
81 | #define XICS_QIRR 0xc | ||
82 | |||
83 | /* | ||
84 | * We come in here when wakened from nap mode on a secondary hw thread. | ||
85 | * Relocation is off and most register values are lost. | ||
86 | * r13 points to the PACA. | ||
87 | */ | ||
88 | .globl kvm_start_guest | ||
89 | kvm_start_guest: | ||
90 | ld r1,PACAEMERGSP(r13) | ||
91 | subi r1,r1,STACK_FRAME_OVERHEAD | ||
92 | |||
93 | /* get vcpu pointer */ | ||
94 | ld r4, HSTATE_KVM_VCPU(r13) | ||
95 | |||
96 | /* We got here with an IPI; clear it */ | ||
97 | ld r5, HSTATE_XICS_PHYS(r13) | ||
98 | li r0, 0xff | ||
99 | li r6, XICS_QIRR | ||
100 | li r7, XICS_XIRR | ||
101 | lwzcix r8, r5, r7 /* ack the interrupt */ | ||
102 | sync | ||
103 | stbcix r0, r5, r6 /* clear it */ | ||
104 | stwcix r8, r5, r7 /* EOI it */ | ||
105 | |||
82 | .global kvmppc_hv_entry | 106 | .global kvmppc_hv_entry |
83 | kvmppc_hv_entry: | 107 | kvmppc_hv_entry: |
84 | 108 | ||
@@ -200,7 +224,20 @@ kvmppc_hv_entry: | |||
200 | slbia | 224 | slbia |
201 | ptesync | 225 | ptesync |
202 | 226 | ||
203 | /* Switch to guest partition. */ | 227 | /* Increment entry count iff exit count is zero. */ |
228 | ld r5,HSTATE_KVM_VCORE(r13) | ||
229 | addi r9,r5,VCORE_ENTRY_EXIT | ||
230 | 21: lwarx r3,0,r9 | ||
231 | cmpwi r3,0x100 /* any threads starting to exit? */ | ||
232 | bge secondary_too_late /* if so we're too late to the party */ | ||
233 | addi r3,r3,1 | ||
234 | stwcx. r3,0,r9 | ||
235 | bne 21b | ||
236 | |||
237 | /* Primary thread switches to guest partition. */ | ||
238 | lwz r6,VCPU_PTID(r4) | ||
239 | cmpwi r6,0 | ||
240 | bne 20f | ||
204 | ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ | 241 | ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ |
205 | ld r6,KVM_SDR1(r9) | 242 | ld r6,KVM_SDR1(r9) |
206 | lwz r7,KVM_LPID(r9) | 243 | lwz r7,KVM_LPID(r9) |
@@ -210,7 +247,15 @@ kvmppc_hv_entry: | |||
210 | mtspr SPRN_SDR1,r6 /* switch to partition page table */ | 247 | mtspr SPRN_SDR1,r6 /* switch to partition page table */ |
211 | mtspr SPRN_LPID,r7 | 248 | mtspr SPRN_LPID,r7 |
212 | isync | 249 | isync |
213 | ld r8,VCPU_LPCR(r4) | 250 | li r0,1 |
251 | stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ | ||
252 | b 10f | ||
253 | |||
254 | /* Secondary threads wait for primary to have done partition switch */ | ||
255 | 20: lbz r0,VCORE_IN_GUEST(r5) | ||
256 | cmpwi r0,0 | ||
257 | beq 20b | ||
258 | 10: ld r8,VCPU_LPCR(r4) | ||
214 | mtspr SPRN_LPCR,r8 | 259 | mtspr SPRN_LPCR,r8 |
215 | isync | 260 | isync |
216 | 261 | ||
@@ -225,10 +270,12 @@ kvmppc_hv_entry: | |||
225 | * Invalidate the TLB if we could possibly have stale TLB | 270 | * Invalidate the TLB if we could possibly have stale TLB |
226 | * entries for this partition on this core due to the use | 271 | * entries for this partition on this core due to the use |
227 | * of tlbiel. | 272 | * of tlbiel. |
273 | * XXX maybe only need this on primary thread? | ||
228 | */ | 274 | */ |
229 | ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ | 275 | ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ |
230 | lwz r5,VCPU_VCPUID(r4) | 276 | lwz r5,VCPU_VCPUID(r4) |
231 | lhz r6,PACAPACAINDEX(r13) | 277 | lhz r6,PACAPACAINDEX(r13) |
278 | rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */ | ||
232 | lhz r8,VCPU_LAST_CPU(r4) | 279 | lhz r8,VCPU_LAST_CPU(r4) |
233 | sldi r7,r6,1 /* see if this is the same vcpu */ | 280 | sldi r7,r6,1 /* see if this is the same vcpu */ |
234 | add r7,r7,r9 /* as last ran on this pcpu */ | 281 | add r7,r7,r9 /* as last ran on this pcpu */ |
@@ -512,8 +559,60 @@ hcall_real_cont: | |||
512 | ptesync | 559 | ptesync |
513 | 560 | ||
514 | hdec_soon: | 561 | hdec_soon: |
515 | /* Switch back to host partition */ | 562 | /* Increment the threads-exiting-guest count in the 0xff00 |
563 | bits of vcore->entry_exit_count */ | ||
564 | lwsync | ||
565 | ld r5,HSTATE_KVM_VCORE(r13) | ||
566 | addi r6,r5,VCORE_ENTRY_EXIT | ||
567 | 41: lwarx r3,0,r6 | ||
568 | addi r0,r3,0x100 | ||
569 | stwcx. r0,0,r6 | ||
570 | bne 41b | ||
571 | |||
572 | /* | ||
573 | * At this point we have an interrupt that we have to pass | ||
574 | * up to the kernel or qemu; we can't handle it in real mode. | ||
575 | * Thus we have to do a partition switch, so we have to | ||
576 | * collect the other threads, if we are the first thread | ||
577 | * to take an interrupt. To do this, we set the HDEC to 0, | ||
578 | * which causes an HDEC interrupt in all threads within 2ns | ||
579 | * because the HDEC register is shared between all 4 threads. | ||
580 | * However, we don't need to bother if this is an HDEC | ||
581 | * interrupt, since the other threads will already be on their | ||
582 | * way here in that case. | ||
583 | */ | ||
584 | cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER | ||
585 | beq 40f | ||
586 | cmpwi r3,0x100 /* Are we the first here? */ | ||
587 | bge 40f | ||
588 | cmpwi r3,1 | ||
589 | ble 40f | ||
590 | li r0,0 | ||
591 | mtspr SPRN_HDEC,r0 | ||
592 | 40: | ||
593 | |||
594 | /* Secondary threads wait for primary to do partition switch */ | ||
516 | ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ | 595 | ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ |
596 | ld r5,HSTATE_KVM_VCORE(r13) | ||
597 | lwz r3,VCPU_PTID(r9) | ||
598 | cmpwi r3,0 | ||
599 | beq 15f | ||
600 | HMT_LOW | ||
601 | 13: lbz r3,VCORE_IN_GUEST(r5) | ||
602 | cmpwi r3,0 | ||
603 | bne 13b | ||
604 | HMT_MEDIUM | ||
605 | b 16f | ||
606 | |||
607 | /* Primary thread waits for all the secondaries to exit guest */ | ||
608 | 15: lwz r3,VCORE_ENTRY_EXIT(r5) | ||
609 | srwi r0,r3,8 | ||
610 | clrldi r3,r3,56 | ||
611 | cmpw r3,r0 | ||
612 | bne 15b | ||
613 | isync | ||
614 | |||
615 | /* Primary thread switches back to host partition */ | ||
517 | ld r6,KVM_HOST_SDR1(r4) | 616 | ld r6,KVM_HOST_SDR1(r4) |
518 | lwz r7,KVM_HOST_LPID(r4) | 617 | lwz r7,KVM_HOST_LPID(r4) |
519 | li r8,LPID_RSVD /* switch to reserved LPID */ | 618 | li r8,LPID_RSVD /* switch to reserved LPID */ |
@@ -522,10 +621,12 @@ hdec_soon: | |||
522 | mtspr SPRN_SDR1,r6 /* switch to partition page table */ | 621 | mtspr SPRN_SDR1,r6 /* switch to partition page table */ |
523 | mtspr SPRN_LPID,r7 | 622 | mtspr SPRN_LPID,r7 |
524 | isync | 623 | isync |
624 | li r0,0 | ||
625 | stb r0,VCORE_IN_GUEST(r5) | ||
525 | lis r8,0x7fff /* MAX_INT@h */ | 626 | lis r8,0x7fff /* MAX_INT@h */ |
526 | mtspr SPRN_HDEC,r8 | 627 | mtspr SPRN_HDEC,r8 |
527 | 628 | ||
528 | ld r8,KVM_HOST_LPCR(r4) | 629 | 16: ld r8,KVM_HOST_LPCR(r4) |
529 | mtspr SPRN_LPCR,r8 | 630 | mtspr SPRN_LPCR,r8 |
530 | isync | 631 | isync |
531 | 632 | ||
@@ -634,6 +735,11 @@ hdec_soon: | |||
634 | mr r3, r9 | 735 | mr r3, r9 |
635 | bl .kvmppc_save_fp | 736 | bl .kvmppc_save_fp |
636 | 737 | ||
738 | /* Secondary threads go off to take a nap */ | ||
739 | lwz r0,VCPU_PTID(r3) | ||
740 | cmpwi r0,0 | ||
741 | bne secondary_nap | ||
742 | |||
637 | /* | 743 | /* |
638 | * Reload DEC. HDEC interrupts were disabled when | 744 | * Reload DEC. HDEC interrupts were disabled when |
639 | * we reloaded the host's LPCR value. | 745 | * we reloaded the host's LPCR value. |
@@ -840,6 +946,56 @@ _GLOBAL(kvmppc_h_set_dabr) | |||
840 | li r3,0 | 946 | li r3,0 |
841 | blr | 947 | blr |
842 | 948 | ||
949 | secondary_too_late: | ||
950 | ld r5,HSTATE_KVM_VCORE(r13) | ||
951 | HMT_LOW | ||
952 | 13: lbz r3,VCORE_IN_GUEST(r5) | ||
953 | cmpwi r3,0 | ||
954 | bne 13b | ||
955 | HMT_MEDIUM | ||
956 | ld r11,PACA_SLBSHADOWPTR(r13) | ||
957 | |||
958 | .rept SLB_NUM_BOLTED | ||
959 | ld r5,SLBSHADOW_SAVEAREA(r11) | ||
960 | ld r6,SLBSHADOW_SAVEAREA+8(r11) | ||
961 | andis. r7,r5,SLB_ESID_V@h | ||
962 | beq 1f | ||
963 | slbmte r6,r5 | ||
964 | 1: addi r11,r11,16 | ||
965 | .endr | ||
966 | b 50f | ||
967 | |||
968 | secondary_nap: | ||
969 | /* Clear any pending IPI */ | ||
970 | 50: ld r5, HSTATE_XICS_PHYS(r13) | ||
971 | li r0, 0xff | ||
972 | li r6, XICS_QIRR | ||
973 | stbcix r0, r5, r6 | ||
974 | |||
975 | /* increment the nap count and then go to nap mode */ | ||
976 | ld r4, HSTATE_KVM_VCORE(r13) | ||
977 | addi r4, r4, VCORE_NAP_COUNT | ||
978 | lwsync /* make previous updates visible */ | ||
979 | 51: lwarx r3, 0, r4 | ||
980 | addi r3, r3, 1 | ||
981 | stwcx. r3, 0, r4 | ||
982 | bne 51b | ||
983 | isync | ||
984 | |||
985 | mfspr r4, SPRN_LPCR | ||
986 | li r0, LPCR_PECE | ||
987 | andc r4, r4, r0 | ||
988 | ori r4, r4, LPCR_PECE0 /* exit nap on interrupt */ | ||
989 | mtspr SPRN_LPCR, r4 | ||
990 | li r0, 0 | ||
991 | std r0, HSTATE_SCRATCH0(r13) | ||
992 | ptesync | ||
993 | ld r0, HSTATE_SCRATCH0(r13) | ||
994 | 1: cmpd r0, r0 | ||
995 | bne 1b | ||
996 | nap | ||
997 | b . | ||
998 | |||
843 | /* | 999 | /* |
844 | * Save away FP, VMX and VSX registers. | 1000 | * Save away FP, VMX and VSX registers. |
845 | * r3 = vcpu pointer | 1001 | * r3 = vcpu pointer |