diff options
author | Emil Medve <Emilian.Medve@Freescale.com> | 2007-11-13 11:24:04 -0500 |
---|---|---|
committer | Paul Mackerras <paulus@samba.org> | 2007-12-20 23:05:58 -0500 |
commit | eda09fbdcd8c5afaa81c2f1d28e8b9725bad4d5a (patch) | |
tree | 0afe6e81172fc9508f8e4c13598a276fe3d043c6 /arch/powerpc | |
parent | 1fe58a875e4bb08125c657b1b91ac515d2bdbcbe (diff) |
[POWERPC] Optimize counting distinct entries in the relocation sections
When a module has relocation sections with tens of thousands of
entries, counting the distinct/unique entries only (i.e. no
duplicates) at load time can take tens of seconds and up to minutes.
The sore point is the count_relocs() function which is called as part
of the architecture specific module loading processing path:
-> load_module() generic
-> module_frob_arch_sections() arch specific
-> get_plt_size() 32-bit
-> get_stubs_size() 64-bit
-> count_relocs()
Here count_relocs is being called to find out how many distinct
targets of R_PPC_REL24 relocations there are, since each distinct
target needs a PLT entry or a stub created for it.
The previous counting algorithm has O(n^2) complexity. Basically two
solutions were proposed on the e-mail list: a hash based approach and
a sort based approach.
The hash based approach is the fastest (O(n)) but the has it needs
additional memory and for certain corner cases it could take lots of
memory due to the degeneration of the hash. One such proposal was
submitted here:
http://ozlabs.org/pipermail/linuxppc-dev/2007-June/037641.html
The sort based approach is slower (O(n * log n + n)) but if the
sorting is done "in place" it doesn't need additional memory.
This has O(n + n * log n) complexity with no additional memory
requirements.
This commit implements the in-place sort option.
Signed-off-by: Emil Medve <Emilian.Medve@Freescale.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch/powerpc')
-rw-r--r-- | arch/powerpc/kernel/module_32.c | 77 | ||||
-rw-r--r-- | arch/powerpc/kernel/module_64.c | 81 |
2 files changed, 127 insertions, 31 deletions
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 07a89a398639..eab313858315 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
25 | #include <linux/cache.h> | 25 | #include <linux/cache.h> |
26 | #include <linux/bug.h> | 26 | #include <linux/bug.h> |
27 | #include <linux/sort.h> | ||
27 | 28 | ||
28 | #include "setup.h" | 29 | #include "setup.h" |
29 | 30 | ||
@@ -54,22 +55,60 @@ void module_free(struct module *mod, void *module_region) | |||
54 | addend) */ | 55 | addend) */ |
55 | static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num) | 56 | static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num) |
56 | { | 57 | { |
57 | unsigned int i, j, ret = 0; | 58 | unsigned int i, r_info, r_addend, _count_relocs; |
58 | 59 | ||
59 | /* Sure, this is order(n^2), but it's usually short, and not | 60 | _count_relocs = 0; |
60 | time critical */ | 61 | r_info = 0; |
61 | for (i = 0; i < num; i++) { | 62 | r_addend = 0; |
62 | for (j = 0; j < i; j++) { | 63 | for (i = 0; i < num; i++) |
63 | /* If this addend appeared before, it's | 64 | /* Only count 24-bit relocs, others don't need stubs */ |
64 | already been counted */ | 65 | if (ELF32_R_TYPE(rela[i].r_info) == R_PPC_REL24 && |
65 | if (ELF32_R_SYM(rela[i].r_info) | 66 | (r_info != ELF32_R_SYM(rela[i].r_info) || |
66 | == ELF32_R_SYM(rela[j].r_info) | 67 | r_addend != rela[i].r_addend)) { |
67 | && rela[i].r_addend == rela[j].r_addend) | 68 | _count_relocs++; |
68 | break; | 69 | r_info = ELF32_R_SYM(rela[i].r_info); |
70 | r_addend = rela[i].r_addend; | ||
69 | } | 71 | } |
70 | if (j == i) ret++; | 72 | |
73 | return _count_relocs; | ||
74 | } | ||
75 | |||
76 | static int relacmp(const void *_x, const void *_y) | ||
77 | { | ||
78 | const Elf32_Rela *x, *y; | ||
79 | |||
80 | y = (Elf32_Rela *)_x; | ||
81 | x = (Elf32_Rela *)_y; | ||
82 | |||
83 | /* Compare the entire r_info (as opposed to ELF32_R_SYM(r_info) only) to | ||
84 | * make the comparison cheaper/faster. It won't affect the sorting or | ||
85 | * the counting algorithms' performance | ||
86 | */ | ||
87 | if (x->r_info < y->r_info) | ||
88 | return -1; | ||
89 | else if (x->r_info > y->r_info) | ||
90 | return 1; | ||
91 | else if (x->r_addend < y->r_addend) | ||
92 | return -1; | ||
93 | else if (x->r_addend > y->r_addend) | ||
94 | return 1; | ||
95 | else | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | static void relaswap(void *_x, void *_y, int size) | ||
100 | { | ||
101 | uint32_t *x, *y, tmp; | ||
102 | int i; | ||
103 | |||
104 | y = (uint32_t *)_x; | ||
105 | x = (uint32_t *)_y; | ||
106 | |||
107 | for (i = 0; i < sizeof(Elf32_Rela) / sizeof(uint32_t); i++) { | ||
108 | tmp = x[i]; | ||
109 | x[i] = y[i]; | ||
110 | y[i] = tmp; | ||
71 | } | 111 | } |
72 | return ret; | ||
73 | } | 112 | } |
74 | 113 | ||
75 | /* Get the potential trampolines size required of the init and | 114 | /* Get the potential trampolines size required of the init and |
@@ -100,6 +139,16 @@ static unsigned long get_plt_size(const Elf32_Ehdr *hdr, | |||
100 | DEBUGP("Ptr: %p. Number: %u\n", | 139 | DEBUGP("Ptr: %p. Number: %u\n", |
101 | (void *)hdr + sechdrs[i].sh_offset, | 140 | (void *)hdr + sechdrs[i].sh_offset, |
102 | sechdrs[i].sh_size / sizeof(Elf32_Rela)); | 141 | sechdrs[i].sh_size / sizeof(Elf32_Rela)); |
142 | |||
143 | /* Sort the relocation information based on a symbol and | ||
144 | * addend key. This is a stable O(n*log n) complexity | ||
145 | * alogrithm but it will reduce the complexity of | ||
146 | * count_relocs() to linear complexity O(n) | ||
147 | */ | ||
148 | sort((void *)hdr + sechdrs[i].sh_offset, | ||
149 | sechdrs[i].sh_size / sizeof(Elf32_Rela), | ||
150 | sizeof(Elf32_Rela), relacmp, relaswap); | ||
151 | |||
103 | ret += count_relocs((void *)hdr | 152 | ret += count_relocs((void *)hdr |
104 | + sechdrs[i].sh_offset, | 153 | + sechdrs[i].sh_offset, |
105 | sechdrs[i].sh_size | 154 | sechdrs[i].sh_size |
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 75c7c4f19280..3a82b02b784b 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <asm/module.h> | 24 | #include <asm/module.h> |
25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
26 | #include <asm/firmware.h> | 26 | #include <asm/firmware.h> |
27 | #include <linux/sort.h> | ||
27 | 28 | ||
28 | #include "setup.h" | 29 | #include "setup.h" |
29 | 30 | ||
@@ -81,25 +82,23 @@ static struct ppc64_stub_entry ppc64_stub = | |||
81 | different addend) */ | 82 | different addend) */ |
82 | static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num) | 83 | static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num) |
83 | { | 84 | { |
84 | unsigned int i, j, ret = 0; | 85 | unsigned int i, r_info, r_addend, _count_relocs; |
85 | 86 | ||
86 | /* FIXME: Only count external ones --RR */ | 87 | /* FIXME: Only count external ones --RR */ |
87 | /* Sure, this is order(n^2), but it's usually short, and not | 88 | _count_relocs = 0; |
88 | time critical */ | 89 | r_info = 0; |
89 | for (i = 0; i < num; i++) { | 90 | r_addend = 0; |
91 | for (i = 0; i < num; i++) | ||
90 | /* Only count 24-bit relocs, others don't need stubs */ | 92 | /* Only count 24-bit relocs, others don't need stubs */ |
91 | if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24) | 93 | if (ELF64_R_TYPE(rela[i].r_info) == R_PPC_REL24 && |
92 | continue; | 94 | (r_info != ELF64_R_SYM(rela[i].r_info) || |
93 | for (j = 0; j < i; j++) { | 95 | r_addend != rela[i].r_addend)) { |
94 | /* If this addend appeared before, it's | 96 | _count_relocs++; |
95 | already been counted */ | 97 | r_info = ELF64_R_SYM(rela[i].r_info); |
96 | if (rela[i].r_info == rela[j].r_info | 98 | r_addend = rela[i].r_addend; |
97 | && rela[i].r_addend == rela[j].r_addend) | ||
98 | break; | ||
99 | } | 99 | } |
100 | if (j == i) ret++; | 100 | |
101 | } | 101 | return _count_relocs; |
102 | return ret; | ||
103 | } | 102 | } |
104 | 103 | ||
105 | void *module_alloc(unsigned long size) | 104 | void *module_alloc(unsigned long size) |
@@ -118,6 +117,44 @@ void module_free(struct module *mod, void *module_region) | |||
118 | table entries. */ | 117 | table entries. */ |
119 | } | 118 | } |
120 | 119 | ||
120 | static int relacmp(const void *_x, const void *_y) | ||
121 | { | ||
122 | const Elf64_Rela *x, *y; | ||
123 | |||
124 | y = (Elf64_Rela *)_x; | ||
125 | x = (Elf64_Rela *)_y; | ||
126 | |||
127 | /* Compare the entire r_info (as opposed to ELF64_R_SYM(r_info) only) to | ||
128 | * make the comparison cheaper/faster. It won't affect the sorting or | ||
129 | * the counting algorithms' performance | ||
130 | */ | ||
131 | if (x->r_info < y->r_info) | ||
132 | return -1; | ||
133 | else if (x->r_info > y->r_info) | ||
134 | return 1; | ||
135 | else if (x->r_addend < y->r_addend) | ||
136 | return -1; | ||
137 | else if (x->r_addend > y->r_addend) | ||
138 | return 1; | ||
139 | else | ||
140 | return 0; | ||
141 | } | ||
142 | |||
143 | static void relaswap(void *_x, void *_y, int size) | ||
144 | { | ||
145 | uint64_t *x, *y, tmp; | ||
146 | int i; | ||
147 | |||
148 | y = (uint64_t *)_x; | ||
149 | x = (uint64_t *)_y; | ||
150 | |||
151 | for (i = 0; i < sizeof(Elf64_Rela) / sizeof(uint64_t); i++) { | ||
152 | tmp = x[i]; | ||
153 | x[i] = y[i]; | ||
154 | y[i] = tmp; | ||
155 | } | ||
156 | } | ||
157 | |||
121 | /* Get size of potential trampolines required. */ | 158 | /* Get size of potential trampolines required. */ |
122 | static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, | 159 | static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, |
123 | const Elf64_Shdr *sechdrs) | 160 | const Elf64_Shdr *sechdrs) |
@@ -133,6 +170,16 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, | |||
133 | DEBUGP("Ptr: %p. Number: %lu\n", | 170 | DEBUGP("Ptr: %p. Number: %lu\n", |
134 | (void *)sechdrs[i].sh_addr, | 171 | (void *)sechdrs[i].sh_addr, |
135 | sechdrs[i].sh_size / sizeof(Elf64_Rela)); | 172 | sechdrs[i].sh_size / sizeof(Elf64_Rela)); |
173 | |||
174 | /* Sort the relocation information based on a symbol and | ||
175 | * addend key. This is a stable O(n*log n) complexity | ||
176 | * alogrithm but it will reduce the complexity of | ||
177 | * count_relocs() to linear complexity O(n) | ||
178 | */ | ||
179 | sort((void *)sechdrs[i].sh_addr, | ||
180 | sechdrs[i].sh_size / sizeof(Elf64_Rela), | ||
181 | sizeof(Elf64_Rela), relacmp, relaswap); | ||
182 | |||
136 | relocs += count_relocs((void *)sechdrs[i].sh_addr, | 183 | relocs += count_relocs((void *)sechdrs[i].sh_addr, |
137 | sechdrs[i].sh_size | 184 | sechdrs[i].sh_size |
138 | / sizeof(Elf64_Rela)); | 185 | / sizeof(Elf64_Rela)); |
@@ -343,7 +390,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, | |||
343 | /* Simply set it */ | 390 | /* Simply set it */ |
344 | *(u32 *)location = value; | 391 | *(u32 *)location = value; |
345 | break; | 392 | break; |
346 | 393 | ||
347 | case R_PPC64_ADDR64: | 394 | case R_PPC64_ADDR64: |
348 | /* Simply set it */ | 395 | /* Simply set it */ |
349 | *(unsigned long *)location = value; | 396 | *(unsigned long *)location = value; |
@@ -399,7 +446,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, | |||
399 | } | 446 | } |
400 | 447 | ||
401 | /* Only replace bits 2 through 26 */ | 448 | /* Only replace bits 2 through 26 */ |
402 | *(uint32_t *)location | 449 | *(uint32_t *)location |
403 | = (*(uint32_t *)location & ~0x03fffffc) | 450 | = (*(uint32_t *)location & ~0x03fffffc) |
404 | | (value & 0x03fffffc); | 451 | | (value & 0x03fffffc); |
405 | break; | 452 | break; |