aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorAnton Blanchard <anton@samba.org>2006-01-10 21:16:44 -0500
committerPaul Mackerras <paulus@samba.org>2006-01-10 22:49:45 -0500
commit7a0268fa1a3613f2c526a9b3058701b277f6abe1 (patch)
tree738bf94d9bb5d68d260805dbc1898ec40ebc20e5 /arch
parent193cac99f6d8604aca71e5a966a8cd1dfb84819d (diff)
[PATCH] powerpc/64: per cpu data optimisations
The current ppc64 per cpu data implementation is quite slow. e.g.: lhz 11,18(13) /* smp_processor_id() */ ld 9,.LC63-.LCTOC1(30) /* per_cpu__variable_name */ ld 8,.LC61-.LCTOC1(30) /* __per_cpu_offset */ sldi 11,11,3 /* form index into __per_cpu_offset */ mr 10,9 ldx 9,11,8 /* __per_cpu_offset[smp_processor_id()] */ ldx 0,10,9 /* load per cpu data */ 5 loads for something that is supposed to be fast, pretty awful. One reason for the large number of loads is that we have to synthesize 2 64bit constants (per_cpu__variable_name and __per_cpu_offset). By putting __per_cpu_offset into the paca we can avoid the 2 loads associated with it: ld 11,56(13) /* paca->data_offset */ ld 9,.LC59-.LCTOC1(30) /* per_cpu__variable_name */ ldx 0,9,11 /* load per cpu data */ Longer term we should be able to do even better than 3 loads. If per_cpu__variable_name wasn't a 64bit constant and paca->data_offset was in a register we could cut it down to one load. A suggestion from Rusty is to use gcc's __thread extension here. In order to do this we would need to free up r13 (the __thread register and where the paca currently is). So far I've had a few unsuccessful attempts at doing that :) The patch also allocates per cpu memory node-local on NUMA machines. This patch from Rusty has been sitting in my queue _forever_ but stalled when I hit the compiler bug. Sorry about that. Finally I also only allocate per cpu data for possible cpus, which comes straight out of the x86-64 port. On a pseries kernel (with NR_CPUS == 128) and 4 possible cpus we see some nice gains: total used free shared buffers cached Mem: 4012228 212860 3799368 0 0 162424 total used free shared buffers cached Mem: 4016200 212984 3803216 0 0 162424 A saving of 3.75MB. Quite nice for smaller machines. Note: we now have to be careful of per cpu users that touch data for !possible cpus. 
At this stage it might be worth making the NUMA and possible cpu optimisations generic, but per cpu init is done so early we have to be careful that all architectures have their possible map set up correctly. Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/powerpc/kernel/setup_64.c26
1 files changed, 26 insertions, 0 deletions
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 0420418f317a..e29b275e09e0 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -33,6 +33,7 @@
33#include <linux/unistd.h> 33#include <linux/unistd.h>
34#include <linux/serial.h> 34#include <linux/serial.h>
35#include <linux/serial_8250.h> 35#include <linux/serial_8250.h>
36#include <linux/bootmem.h>
36#include <asm/io.h> 37#include <asm/io.h>
37#include <asm/kdump.h> 38#include <asm/kdump.h>
38#include <asm/prom.h> 39#include <asm/prom.h>
@@ -654,3 +655,28 @@ void cpu_die(void)
654 if (ppc_md.cpu_die) 655 if (ppc_md.cpu_die)
655 ppc_md.cpu_die(); 656 ppc_md.cpu_die();
656} 657}
658
659#ifdef CONFIG_SMP
660void __init setup_per_cpu_areas(void)
661{
662 int i;
663 unsigned long size;
664 char *ptr;
665
666 /* Copy section for each CPU (we discard the original) */
667 size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
668#ifdef CONFIG_MODULES
669 if (size < PERCPU_ENOUGH_ROOM)
670 size = PERCPU_ENOUGH_ROOM;
671#endif
672
673 for_each_cpu(i) {
674 ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
675 if (!ptr)
676 panic("Cannot allocate cpu data for CPU %d\n", i);
677
678 paca[i].data_offset = ptr - __per_cpu_start;
679 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
680 }
681}
682#endif