[POWERPC] Optimize counting distinct entries in the relocation sections

When a module has relocation sections with tens of thousands of entries, counting the distinct/unique entries only (i.e. no duplicates) at load time can take tens of seconds and up to minutes. The sore point is the count_relocs() function which is called as part of the architecture specific module loading processing path: -> load_module() generic -> module_frob_arch_sections() arch specific -> get_plt_size() 32-bit -> get_stubs_size() 64-bit -> count_relocs() Here count_relocs is being called to find out how many distinct targets of R_PPC_REL24 relocations there are, since each distinct target needs a PLT entry or a stub created for it. The previous counting algorithm has O(n^2) complexity. Basically two solutions were proposed on the e-mail list: a hash based approach and a sort based approach. The hash based approach is the fastest (O(n)) but the has it needs additional memory and for certain corner cases it could take lots of memory due to the degeneration of the hash. One such proposal was submitted here: http://ozlabs.org/pipermail/linuxppc-dev/2007-June/037641.html The sort based approach is slower (O(n * log n + n)) but if the sorting is done "in place" it doesn't need additional memory. This has O(n + n * log n) complexity with no additional memory requirements. This commit implements the in-place sort option. Signed-off-by: Emil Medve <Emilian.Medve@Freescale.com> Signed-off-by: Paul Mackerras <paulus@samba.org>
author: Emil Medve <Emilian.Medve@Freescale.com> 2007-11-13 11:24:04 -0500
committer: Paul Mackerras <paulus@samba.org> 2007-12-20 23:05:58 -0500
commit: eda09fbdcd8c5afaa81c2f1d28e8b9725bad4d5a (patch)
tree: 0afe6e81172fc9508f8e4c13598a276fe3d043c6 /arch
parent: 1fe58a875e4bb08125c657b1b91ac515d2bdbcbe (diff)
2 files changed, 127 insertions, 31 deletions
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index 07a89a398639..eab313858315 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -24,6 +24,7 @@
 #include <linux/kernel.h>
 #include <linux/cache.h>
 #include <linux/bug.h>
+#include <linux/sort.h>
 #include "setup.h"
@@ -54,22 +55,60 @@ void module_free(struct module *mod, void *module_region)
   addend) */
 static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
 {
-        unsigned int i, j, ret = 0;
+        unsigned int i, r_info, r_addend, _count_relocs;
-        /* Sure, this is order(n^2), but it's usually short, and not
+        _count_relocs = 0;
-           time critical */
+        r_info = 0;
-        for (i = 0; i < num; i++) {
+        r_addend = 0;
-                for (j = 0; j < i; j++) {
+        for (i = 0; i < num; i++)
-                        /* If this addend appeared before, it's
+                /* Only count 24-bit relocs, others don't need stubs */
-                           already been counted */
+                if (ELF32_R_TYPE(rela[i].r_info) == R_PPC_REL24 &&
-                        if (ELF32_R_SYM(rela[i].r_info)
+                    (r_info != ELF32_R_SYM(rela[i].r_info) ||
-                            == ELF32_R_SYM(rela[j].r_info)
+                     r_addend != rela[i].r_addend)) {
-                            && rela[i].r_addend == rela[j].r_addend)
+                        _count_relocs++;
-                                break;
+                        r_info = ELF32_R_SYM(rela[i].r_info);
+                        r_addend = rela[i].r_addend;
                }
-                if (j == i) ret++;
+        return _count_relocs;
+}
+static int relacmp(const void *_x, const void *_y)
+{
+        const Elf32_Rela *x, *y;
+        y = (Elf32_Rela *)_x;
+        x = (Elf32_Rela *)_y;
+        /* Compare the entire r_info (as opposed to ELF32_R_SYM(r_info) only) to
+         * make the comparison cheaper/faster. It won't affect the sorting or
+         * the counting algorithms' performance
+         */
+        if (x->r_info < y->r_info)
+                return -1;
+        else if (x->r_info > y->r_info)
+                return 1;
+        else if (x->r_addend < y->r_addend)
+                return -1;
+        else if (x->r_addend > y->r_addend)
+                return 1;
+        else
+                return 0;
+}
+static void relaswap(void *_x, void *_y, int size)
+{
+        uint32_t *x, *y, tmp;
+        int i;
+        y = (uint32_t *)_x;
+        x = (uint32_t *)_y;
+        for (i = 0; i < sizeof(Elf32_Rela) / sizeof(uint32_t); i++) {
+                tmp = x[i];
+                x[i] = y[i];
+                y[i] = tmp;
        }
-        return ret;
 }
 /* Get the potential trampolines size required of the init and
@@ -100,6 +139,16 @@ static unsigned long get_plt_size(const Elf32_Ehdr *hdr,
                        DEBUGP("Ptr: %p.  Number: %u\n",
                               (void *)hdr + sechdrs[i].sh_offset,
                               sechdrs[i].sh_size / sizeof(Elf32_Rela));
+                        /* Sort the relocation information based on a symbol and
+                         * addend key. This is a stable O(n*log n) complexity
+                         * alogrithm but it will reduce the complexity of
+                         * count_relocs() to linear complexity O(n)
+                         */
+                        sort((void *)hdr + sechdrs[i].sh_offset,
+                             sechdrs[i].sh_size / sizeof(Elf32_Rela),
+                             sizeof(Elf32_Rela), relacmp, relaswap);
                        ret += count_relocs((void *)hdr
                                             + sechdrs[i].sh_offset,
                                             sechdrs[i].sh_size
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 75c7c4f19280..3a82b02b784b 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -24,6 +24,7 @@
 #include <asm/module.h>
 #include <asm/uaccess.h>
 #include <asm/firmware.h>
+#include <linux/sort.h>
 #include "setup.h"
@@ -81,25 +82,23 @@ static struct ppc64_stub_entry ppc64_stub =
   different addend) */
 static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num)
 {
-        unsigned int i, j, ret = 0;
+        unsigned int i, r_info, r_addend, _count_relocs;
        /* FIXME: Only count external ones --RR */
-        /* Sure, this is order(n^2), but it's usually short, and not
+        _count_relocs = 0;
-           time critical */
+        r_info = 0;
-        for (i = 0; i < num; i++) {
+        r_addend = 0;
+        for (i = 0; i < num; i++)
                /* Only count 24-bit relocs, others don't need stubs */
-                if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24)
+                if (ELF64_R_TYPE(rela[i].r_info) == R_PPC_REL24 &&
-                        continue;
+                    (r_info != ELF64_R_SYM(rela[i].r_info) ||
-                for (j = 0; j < i; j++) {
+                     r_addend != rela[i].r_addend)) {
-                        /* If this addend appeared before, it's
+                        _count_relocs++;
-                           already been counted */
+                        r_info = ELF64_R_SYM(rela[i].r_info);
-                        if (rela[i].r_info == rela[j].r_info
+                        r_addend = rela[i].r_addend;
-                            && rela[i].r_addend == rela[j].r_addend)
-                                break;
                }
-                if (j == i) ret++;
-        }
+        return _count_relocs;
-        return ret;
 }
 void *module_alloc(unsigned long size)
@@ -118,6 +117,44 @@ void module_free(struct module *mod, void *module_region)
           table entries. */
 }
+static int relacmp(const void *_x, const void *_y)
+{
+        const Elf64_Rela *x, *y;
+        y = (Elf64_Rela *)_x;
+        x = (Elf64_Rela *)_y;
+        /* Compare the entire r_info (as opposed to ELF64_R_SYM(r_info) only) to
+         * make the comparison cheaper/faster. It won't affect the sorting or
+         * the counting algorithms' performance
+         */
+        if (x->r_info < y->r_info)
+                return -1;
+        else if (x->r_info > y->r_info)
+                return 1;
+        else if (x->r_addend < y->r_addend)
+                return -1;
+        else if (x->r_addend > y->r_addend)
+                return 1;
+        else
+                return 0;
+}
+static void relaswap(void *_x, void *_y, int size)
+{
+        uint64_t *x, *y, tmp;
+        int i;
+        y = (uint64_t *)_x;
+        x = (uint64_t *)_y;
+        for (i = 0; i < sizeof(Elf64_Rela) / sizeof(uint64_t); i++) {
+                tmp = x[i];
+                x[i] = y[i];
+                y[i] = tmp;
+        }
+}
 /* Get size of potential trampolines required. */
 static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
                                    const Elf64_Shdr *sechdrs)
@@ -133,6 +170,16 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
                        DEBUGP("Ptr: %p.  Number: %lu\n",
                               (void *)sechdrs[i].sh_addr,
                               sechdrs[i].sh_size / sizeof(Elf64_Rela));
+                        /* Sort the relocation information based on a symbol and
+                         * addend key. This is a stable O(n*log n) complexity
+                         * alogrithm but it will reduce the complexity of
+                         * count_relocs() to linear complexity O(n)
+                         */
+                        sort((void *)sechdrs[i].sh_addr,
+                             sechdrs[i].sh_size / sizeof(Elf64_Rela),
+                             sizeof(Elf64_Rela), relacmp, relaswap);
                        relocs += count_relocs((void *)sechdrs[i].sh_addr,
                                               sechdrs[i].sh_size
                                               / sizeof(Elf64_Rela));
@@ -343,7 +390,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                        /* Simply set it */
                        *(u32 *)location = value;
                        break;
-                        
                case R_PPC64_ADDR64:
                        /* Simply set it */
                        *(unsigned long *)location = value;
@@ -399,7 +446,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                        }
                        /* Only replace bits 2 through 26 */
-                        *(uint32_t *)location 
+                        *(uint32_t *)location
                                = (*(uint32_t *)location & ~0x03fffffc)
                                | (value & 0x03fffffc);
                        break;
author	Emil Medve <Emilian.Medve@Freescale.com>	2007-11-13 11:24:04 -0500
committer	Paul Mackerras <paulus@samba.org>	2007-12-20 23:05:58 -0500
commit	eda09fbdcd8c5afaa81c2f1d28e8b9725bad4d5a (patch)
tree	0afe6e81172fc9508f8e4c13598a276fe3d043c6 /arch
parent	1fe58a875e4bb08125c657b1b91ac515d2bdbcbe (diff)

diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 07a89a398639..eab313858315 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c
@@ -24,6 +24,7 @@
24	#include <linux/kernel.h>	24	#include <linux/kernel.h>
25	#include <linux/cache.h>	25	#include <linux/cache.h>
26	#include <linux/bug.h>	26	#include <linux/bug.h>
		27	#include <linux/sort.h>
27		28
28	#include "setup.h"	29	#include "setup.h"
29		30
@@ -54,22 +55,60 @@ void module_free(struct module mod, void module_region)
54	addend) */	55	addend) */
55	static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)	56	static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
56	{	57	{
57	unsigned int i, j, ret = 0;	58	unsigned int i, r_info, r_addend, _count_relocs;
58		59
59	/* Sure, this is order(n^2), but it's usually short, and not	60	_count_relocs = 0;
60	time critical */	61	r_info = 0;
61	for (i = 0; i < num; i++) {	62	r_addend = 0;
62	for (j = 0; j < i; j++) {	63	for (i = 0; i < num; i++)
63	/* If this addend appeared before, it's	64	/* Only count 24-bit relocs, others don't need stubs */
64	already been counted */	65	if (ELF32_R_TYPE(rela[i].r_info) == R_PPC_REL24 &&
65	if (ELF32_R_SYM(rela[i].r_info)	66	(r_info != ELF32_R_SYM(rela[i].r_info) \|\|
66	== ELF32_R_SYM(rela[j].r_info)	67	r_addend != rela[i].r_addend)) {
67	&& rela[i].r_addend == rela[j].r_addend)	68	_count_relocs++;
68	break;	69	r_info = ELF32_R_SYM(rela[i].r_info);
		70	r_addend = rela[i].r_addend;
69	}	71	}
70	if (j == i) ret++;	72
		73	return _count_relocs;
		74	}
		75
		76	static int relacmp(const void _x, const void _y)
		77	{
		78	const Elf32_Rela x, y;
		79
		80	y = (Elf32_Rela *)_x;
		81	x = (Elf32_Rela *)_y;
		82
		83	/* Compare the entire r_info (as opposed to ELF32_R_SYM(r_info) only) to
		84	* make the comparison cheaper/faster. It won't affect the sorting or
		85	* the counting algorithms' performance
		86	*/
		87	if (x->r_info < y->r_info)
		88	return -1;
		89	else if (x->r_info > y->r_info)
		90	return 1;
		91	else if (x->r_addend < y->r_addend)
		92	return -1;
		93	else if (x->r_addend > y->r_addend)
		94	return 1;
		95	else
		96	return 0;
		97	}
		98
		99	static void relaswap(void _x, void _y, int size)
		100	{
		101	uint32_t x, y, tmp;
		102	int i;
		103
		104	y = (uint32_t *)_x;
		105	x = (uint32_t *)_y;
		106
		107	for (i = 0; i < sizeof(Elf32_Rela) / sizeof(uint32_t); i++) {
		108	tmp = x[i];
		109	x[i] = y[i];
		110	y[i] = tmp;
71	}	111	}
72	return ret;
73	}	112	}
74		113
75	/* Get the potential trampolines size required of the init and	114	/* Get the potential trampolines size required of the init and
@@ -100,6 +139,16 @@ static unsigned long get_plt_size(const Elf32_Ehdr *hdr,
100	DEBUGP("Ptr: %p. Number: %u\n",	139	DEBUGP("Ptr: %p. Number: %u\n",
101	(void *)hdr + sechdrs[i].sh_offset,	140	(void *)hdr + sechdrs[i].sh_offset,
102	sechdrs[i].sh_size / sizeof(Elf32_Rela));	141	sechdrs[i].sh_size / sizeof(Elf32_Rela));
		142
		143	/* Sort the relocation information based on a symbol and
		144	* addend key. This is a stable O(n*log n) complexity
		145	* alogrithm but it will reduce the complexity of
		146	* count_relocs() to linear complexity O(n)
		147	*/
		148	sort((void *)hdr + sechdrs[i].sh_offset,
		149	sechdrs[i].sh_size / sizeof(Elf32_Rela),
		150	sizeof(Elf32_Rela), relacmp, relaswap);
		151
103	ret += count_relocs((void *)hdr	152	ret += count_relocs((void *)hdr
104	+ sechdrs[i].sh_offset,	153	+ sechdrs[i].sh_offset,
105	sechdrs[i].sh_size	154	sechdrs[i].sh_size


diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 75c7c4f19280..3a82b02b784b 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c
@@ -24,6 +24,7 @@
24	#include <asm/module.h>	24	#include <asm/module.h>
25	#include <asm/uaccess.h>	25	#include <asm/uaccess.h>
26	#include <asm/firmware.h>	26	#include <asm/firmware.h>
		27	#include <linux/sort.h>
27		28
28	#include "setup.h"	29	#include "setup.h"
29		30
@@ -81,25 +82,23 @@ static struct ppc64_stub_entry ppc64_stub =
81	different addend) */	82	different addend) */
82	static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num)	83	static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num)
83	{	84	{
84	unsigned int i, j, ret = 0;	85	unsigned int i, r_info, r_addend, _count_relocs;
85		86
86	/* FIXME: Only count external ones --RR */	87	/* FIXME: Only count external ones --RR */
87	/* Sure, this is order(n^2), but it's usually short, and not	88	_count_relocs = 0;
88	time critical */	89	r_info = 0;
89	for (i = 0; i < num; i++) {	90	r_addend = 0;
		91	for (i = 0; i < num; i++)
90	/* Only count 24-bit relocs, others don't need stubs */	92	/* Only count 24-bit relocs, others don't need stubs */
91	if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24)	93	if (ELF64_R_TYPE(rela[i].r_info) == R_PPC_REL24 &&
92	continue;	94	(r_info != ELF64_R_SYM(rela[i].r_info) \|\|
93	for (j = 0; j < i; j++) {	95	r_addend != rela[i].r_addend)) {
94	/* If this addend appeared before, it's	96	_count_relocs++;
95	already been counted */	97	r_info = ELF64_R_SYM(rela[i].r_info);
96	if (rela[i].r_info == rela[j].r_info	98	r_addend = rela[i].r_addend;
97	&& rela[i].r_addend == rela[j].r_addend)
98	break;
99	}	99	}
100	if (j == i) ret++;	100
101	}	101	return _count_relocs;
102	return ret;
103	}	102	}
104		103
105	void *module_alloc(unsigned long size)	104	void *module_alloc(unsigned long size)
@@ -118,6 +117,44 @@ void module_free(struct module mod, void module_region)
118	table entries. */	117	table entries. */
119	}	118	}
120		119
		120	static int relacmp(const void _x, const void _y)
		121	{
		122	const Elf64_Rela x, y;
		123
		124	y = (Elf64_Rela *)_x;
		125	x = (Elf64_Rela *)_y;
		126
		127	/* Compare the entire r_info (as opposed to ELF64_R_SYM(r_info) only) to
		128	* make the comparison cheaper/faster. It won't affect the sorting or
		129	* the counting algorithms' performance
		130	*/
		131	if (x->r_info < y->r_info)
		132	return -1;
		133	else if (x->r_info > y->r_info)
		134	return 1;
		135	else if (x->r_addend < y->r_addend)
		136	return -1;
		137	else if (x->r_addend > y->r_addend)
		138	return 1;
		139	else
		140	return 0;
		141	}
		142
		143	static void relaswap(void _x, void _y, int size)
		144	{
		145	uint64_t x, y, tmp;
		146	int i;
		147
		148	y = (uint64_t *)_x;
		149	x = (uint64_t *)_y;
		150
		151	for (i = 0; i < sizeof(Elf64_Rela) / sizeof(uint64_t); i++) {
		152	tmp = x[i];
		153	x[i] = y[i];
		154	y[i] = tmp;
		155	}
		156	}
		157
121	/* Get size of potential trampolines required. */	158	/* Get size of potential trampolines required. */
122	static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,	159	static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
123	const Elf64_Shdr *sechdrs)	160	const Elf64_Shdr *sechdrs)
@@ -133,6 +170,16 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
133	DEBUGP("Ptr: %p. Number: %lu\n",	170	DEBUGP("Ptr: %p. Number: %lu\n",
134	(void *)sechdrs[i].sh_addr,	171	(void *)sechdrs[i].sh_addr,
135	sechdrs[i].sh_size / sizeof(Elf64_Rela));	172	sechdrs[i].sh_size / sizeof(Elf64_Rela));
		173
		174	/* Sort the relocation information based on a symbol and
		175	* addend key. This is a stable O(n*log n) complexity
		176	* alogrithm but it will reduce the complexity of
		177	* count_relocs() to linear complexity O(n)
		178	*/
		179	sort((void *)sechdrs[i].sh_addr,
		180	sechdrs[i].sh_size / sizeof(Elf64_Rela),
		181	sizeof(Elf64_Rela), relacmp, relaswap);
		182
136	relocs += count_relocs((void *)sechdrs[i].sh_addr,	183	relocs += count_relocs((void *)sechdrs[i].sh_addr,
137	sechdrs[i].sh_size	184	sechdrs[i].sh_size
138	/ sizeof(Elf64_Rela));	185	/ sizeof(Elf64_Rela));
@@ -343,7 +390,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
343	/* Simply set it */	390	/* Simply set it */
344	(u32 )location = value;	391	(u32 )location = value;
345	break;	392	break;
346		393
347	case R_PPC64_ADDR64:	394	case R_PPC64_ADDR64:
348	/* Simply set it */	395	/* Simply set it */
349	(unsigned long )location = value;	396	(unsigned long )location = value;
@@ -399,7 +446,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
399	}	446	}
400		447
401	/* Only replace bits 2 through 26 */	448	/* Only replace bits 2 through 26 */
402	(uint32_t )location	449	(uint32_t )location
403	= ((uint32_t )location & ~0x03fffffc)	450	= ((uint32_t )location & ~0x03fffffc)
404	\| (value & 0x03fffffc);	451	\| (value & 0x03fffffc);
405	break;	452	break;