aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc
diff options
context:
space:
mode:
authorEmil Medve <Emilian.Medve@Freescale.com>2007-11-13 11:24:04 -0500
committerPaul Mackerras <paulus@samba.org>2007-12-20 23:05:58 -0500
commiteda09fbdcd8c5afaa81c2f1d28e8b9725bad4d5a (patch)
tree0afe6e81172fc9508f8e4c13598a276fe3d043c6 /arch/powerpc
parent1fe58a875e4bb08125c657b1b91ac515d2bdbcbe (diff)
[POWERPC] Optimize counting distinct entries in the relocation sections
When a module has relocation sections with tens of thousands of entries, counting the distinct/unique entries only (i.e. no duplicates) at load time can take tens of seconds and up to minutes. The sore point is the count_relocs() function which is called as part of the architecture specific module loading processing path: -> load_module() generic -> module_frob_arch_sections() arch specific -> get_plt_size() 32-bit -> get_stubs_size() 64-bit -> count_relocs() Here count_relocs is being called to find out how many distinct targets of R_PPC_REL24 relocations there are, since each distinct target needs a PLT entry or a stub created for it. The previous counting algorithm has O(n^2) complexity. Basically two solutions were proposed on the e-mail list: a hash based approach and a sort based approach. The hash based approach is the fastest (O(n)) but the has it needs additional memory and for certain corner cases it could take lots of memory due to the degeneration of the hash. One such proposal was submitted here: http://ozlabs.org/pipermail/linuxppc-dev/2007-June/037641.html The sort based approach is slower (O(n * log n + n)) but if the sorting is done "in place" it doesn't need additional memory. This has O(n + n * log n) complexity with no additional memory requirements. This commit implements the in-place sort option. Signed-off-by: Emil Medve <Emilian.Medve@Freescale.com> Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch/powerpc')
-rw-r--r--arch/powerpc/kernel/module_32.c77
-rw-r--r--arch/powerpc/kernel/module_64.c81
2 files changed, 127 insertions, 31 deletions
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index 07a89a398639..eab313858315 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -24,6 +24,7 @@
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/cache.h> 25#include <linux/cache.h>
26#include <linux/bug.h> 26#include <linux/bug.h>
27#include <linux/sort.h>
27 28
28#include "setup.h" 29#include "setup.h"
29 30
@@ -54,22 +55,60 @@ void module_free(struct module *mod, void *module_region)
54 addend) */ 55 addend) */
55static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num) 56static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
56{ 57{
57 unsigned int i, j, ret = 0; 58 unsigned int i, r_info, r_addend, _count_relocs;
58 59
59 /* Sure, this is order(n^2), but it's usually short, and not 60 _count_relocs = 0;
60 time critical */ 61 r_info = 0;
61 for (i = 0; i < num; i++) { 62 r_addend = 0;
62 for (j = 0; j < i; j++) { 63 for (i = 0; i < num; i++)
63 /* If this addend appeared before, it's 64 /* Only count 24-bit relocs, others don't need stubs */
64 already been counted */ 65 if (ELF32_R_TYPE(rela[i].r_info) == R_PPC_REL24 &&
65 if (ELF32_R_SYM(rela[i].r_info) 66 (r_info != ELF32_R_SYM(rela[i].r_info) ||
66 == ELF32_R_SYM(rela[j].r_info) 67 r_addend != rela[i].r_addend)) {
67 && rela[i].r_addend == rela[j].r_addend) 68 _count_relocs++;
68 break; 69 r_info = ELF32_R_SYM(rela[i].r_info);
70 r_addend = rela[i].r_addend;
69 } 71 }
70 if (j == i) ret++; 72
73 return _count_relocs;
74}
75
76static int relacmp(const void *_x, const void *_y)
77{
78 const Elf32_Rela *x, *y;
79
80 y = (Elf32_Rela *)_x;
81 x = (Elf32_Rela *)_y;
82
83 /* Compare the entire r_info (as opposed to ELF32_R_SYM(r_info) only) to
84 * make the comparison cheaper/faster. It won't affect the sorting or
85 * the counting algorithms' performance
86 */
87 if (x->r_info < y->r_info)
88 return -1;
89 else if (x->r_info > y->r_info)
90 return 1;
91 else if (x->r_addend < y->r_addend)
92 return -1;
93 else if (x->r_addend > y->r_addend)
94 return 1;
95 else
96 return 0;
97}
98
99static void relaswap(void *_x, void *_y, int size)
100{
101 uint32_t *x, *y, tmp;
102 int i;
103
104 y = (uint32_t *)_x;
105 x = (uint32_t *)_y;
106
107 for (i = 0; i < sizeof(Elf32_Rela) / sizeof(uint32_t); i++) {
108 tmp = x[i];
109 x[i] = y[i];
110 y[i] = tmp;
71 } 111 }
72 return ret;
73} 112}
74 113
75/* Get the potential trampolines size required of the init and 114/* Get the potential trampolines size required of the init and
@@ -100,6 +139,16 @@ static unsigned long get_plt_size(const Elf32_Ehdr *hdr,
100 DEBUGP("Ptr: %p. Number: %u\n", 139 DEBUGP("Ptr: %p. Number: %u\n",
101 (void *)hdr + sechdrs[i].sh_offset, 140 (void *)hdr + sechdrs[i].sh_offset,
102 sechdrs[i].sh_size / sizeof(Elf32_Rela)); 141 sechdrs[i].sh_size / sizeof(Elf32_Rela));
142
143 /* Sort the relocation information based on a symbol and
144 * addend key. This is a stable O(n*log n) complexity
145 * alogrithm but it will reduce the complexity of
146 * count_relocs() to linear complexity O(n)
147 */
148 sort((void *)hdr + sechdrs[i].sh_offset,
149 sechdrs[i].sh_size / sizeof(Elf32_Rela),
150 sizeof(Elf32_Rela), relacmp, relaswap);
151
103 ret += count_relocs((void *)hdr 152 ret += count_relocs((void *)hdr
104 + sechdrs[i].sh_offset, 153 + sechdrs[i].sh_offset,
105 sechdrs[i].sh_size 154 sechdrs[i].sh_size
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 75c7c4f19280..3a82b02b784b 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -24,6 +24,7 @@
24#include <asm/module.h> 24#include <asm/module.h>
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <asm/firmware.h> 26#include <asm/firmware.h>
27#include <linux/sort.h>
27 28
28#include "setup.h" 29#include "setup.h"
29 30
@@ -81,25 +82,23 @@ static struct ppc64_stub_entry ppc64_stub =
81 different addend) */ 82 different addend) */
82static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num) 83static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num)
83{ 84{
84 unsigned int i, j, ret = 0; 85 unsigned int i, r_info, r_addend, _count_relocs;
85 86
86 /* FIXME: Only count external ones --RR */ 87 /* FIXME: Only count external ones --RR */
87 /* Sure, this is order(n^2), but it's usually short, and not 88 _count_relocs = 0;
88 time critical */ 89 r_info = 0;
89 for (i = 0; i < num; i++) { 90 r_addend = 0;
91 for (i = 0; i < num; i++)
90 /* Only count 24-bit relocs, others don't need stubs */ 92 /* Only count 24-bit relocs, others don't need stubs */
91 if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24) 93 if (ELF64_R_TYPE(rela[i].r_info) == R_PPC_REL24 &&
92 continue; 94 (r_info != ELF64_R_SYM(rela[i].r_info) ||
93 for (j = 0; j < i; j++) { 95 r_addend != rela[i].r_addend)) {
94 /* If this addend appeared before, it's 96 _count_relocs++;
95 already been counted */ 97 r_info = ELF64_R_SYM(rela[i].r_info);
96 if (rela[i].r_info == rela[j].r_info 98 r_addend = rela[i].r_addend;
97 && rela[i].r_addend == rela[j].r_addend)
98 break;
99 } 99 }
100 if (j == i) ret++; 100
101 } 101 return _count_relocs;
102 return ret;
103} 102}
104 103
105void *module_alloc(unsigned long size) 104void *module_alloc(unsigned long size)
@@ -118,6 +117,44 @@ void module_free(struct module *mod, void *module_region)
118 table entries. */ 117 table entries. */
119} 118}
120 119
120static int relacmp(const void *_x, const void *_y)
121{
122 const Elf64_Rela *x, *y;
123
124 y = (Elf64_Rela *)_x;
125 x = (Elf64_Rela *)_y;
126
127 /* Compare the entire r_info (as opposed to ELF64_R_SYM(r_info) only) to
128 * make the comparison cheaper/faster. It won't affect the sorting or
129 * the counting algorithms' performance
130 */
131 if (x->r_info < y->r_info)
132 return -1;
133 else if (x->r_info > y->r_info)
134 return 1;
135 else if (x->r_addend < y->r_addend)
136 return -1;
137 else if (x->r_addend > y->r_addend)
138 return 1;
139 else
140 return 0;
141}
142
143static void relaswap(void *_x, void *_y, int size)
144{
145 uint64_t *x, *y, tmp;
146 int i;
147
148 y = (uint64_t *)_x;
149 x = (uint64_t *)_y;
150
151 for (i = 0; i < sizeof(Elf64_Rela) / sizeof(uint64_t); i++) {
152 tmp = x[i];
153 x[i] = y[i];
154 y[i] = tmp;
155 }
156}
157
121/* Get size of potential trampolines required. */ 158/* Get size of potential trampolines required. */
122static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, 159static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
123 const Elf64_Shdr *sechdrs) 160 const Elf64_Shdr *sechdrs)
@@ -133,6 +170,16 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
133 DEBUGP("Ptr: %p. Number: %lu\n", 170 DEBUGP("Ptr: %p. Number: %lu\n",
134 (void *)sechdrs[i].sh_addr, 171 (void *)sechdrs[i].sh_addr,
135 sechdrs[i].sh_size / sizeof(Elf64_Rela)); 172 sechdrs[i].sh_size / sizeof(Elf64_Rela));
173
174 /* Sort the relocation information based on a symbol and
175 * addend key. This is a stable O(n*log n) complexity
176 * alogrithm but it will reduce the complexity of
177 * count_relocs() to linear complexity O(n)
178 */
179 sort((void *)sechdrs[i].sh_addr,
180 sechdrs[i].sh_size / sizeof(Elf64_Rela),
181 sizeof(Elf64_Rela), relacmp, relaswap);
182
136 relocs += count_relocs((void *)sechdrs[i].sh_addr, 183 relocs += count_relocs((void *)sechdrs[i].sh_addr,
137 sechdrs[i].sh_size 184 sechdrs[i].sh_size
138 / sizeof(Elf64_Rela)); 185 / sizeof(Elf64_Rela));
@@ -343,7 +390,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
343 /* Simply set it */ 390 /* Simply set it */
344 *(u32 *)location = value; 391 *(u32 *)location = value;
345 break; 392 break;
346 393
347 case R_PPC64_ADDR64: 394 case R_PPC64_ADDR64:
348 /* Simply set it */ 395 /* Simply set it */
349 *(unsigned long *)location = value; 396 *(unsigned long *)location = value;
@@ -399,7 +446,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
399 } 446 }
400 447
401 /* Only replace bits 2 through 26 */ 448 /* Only replace bits 2 through 26 */
402 *(uint32_t *)location 449 *(uint32_t *)location
403 = (*(uint32_t *)location & ~0x03fffffc) 450 = (*(uint32_t *)location & ~0x03fffffc)
404 | (value & 0x03fffffc); 451 | (value & 0x03fffffc);
405 break; 452 break;