63 files changed, 2306 insertions, 1796 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a42660c7356a..159c2ff9c127 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1709,7 +1709,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
 	depends on NUMA
 
 config USE_PERCPU_NUMA_NODE_ID
-	def_bool X86_64
+	def_bool y
 	depends on NUMA
 
 menu "Power management and ACPI options"
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4ea15ca89b2b..b964ec457546 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -186,15 +186,7 @@ struct bootnode;
 
 #ifdef CONFIG_ACPI_NUMA
 extern int acpi_numa;
-extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
-				unsigned long end);
-extern int acpi_scan_nodes(unsigned long start, unsigned long end);
-#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
-
-#ifdef CONFIG_NUMA_EMU
-extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
-				int num_nodes);
-#endif
+extern int x86_acpi_numa_init(void);
 #endif /* CONFIG_ACPI_NUMA */
 
 #define acpi_unlazy_tlb(x) leave_mm(x)
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 64dc82ee19f0..e264ae5a1443 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -9,23 +9,20 @@ struct amd_nb_bus_dev_range {
 	u8 dev_limit;
 };
 
-extern struct pci_device_id amd_nb_misc_ids[];
+extern const struct pci_device_id amd_nb_misc_ids[];
 extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
 struct bootnode;
 
 extern int early_is_amd_nb(u32 value);
 extern int amd_cache_northbridges(void);
 extern void amd_flush_garts(void);
-extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
-extern int amd_scan_nodes(void);
-
-#ifdef CONFIG_NUMA_EMU
-extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
-extern void amd_get_nodes(struct bootnode *nodes);
-#endif
+extern int amd_numa_init(void);
+extern int amd_get_subcaches(int);
+extern int amd_set_subcaches(int, int);
 
 struct amd_northbridge {
 	struct pci_dev *misc;
+	struct pci_dev *link;
 };
 
 struct amd_northbridge_info {
@@ -37,6 +34,7 @@ extern struct amd_northbridge_info amd_northbridges;
 
 #define AMD_NB_GART 0x1
 #define AMD_NB_L3_INDEX_DISABLE 0x2
+#define AMD_NB_L3_PARTITIONING 0x4
 
 #ifdef CONFIG_AMD_NB
 
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3c896946f4cc..b8a3484d69e9 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -307,8 +307,6 @@ struct apic {
 
 	void (*setup_apic_routing)(void);
 	int (*multi_timer_check)(int apic, int irq);
-	int (*apicid_to_node)(int logical_apicid);
-	int (*cpu_to_logical_apicid)(int cpu);
 	int (*cpu_present_to_apicid)(int mps_cpu);
 	void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
 	void (*setup_portio_remap)(void);
@@ -356,6 +354,23 @@ struct apic {
 	void (*icr_write)(u32 low, u32 high);
 	void (*wait_icr_idle)(void);
 	u32 (*safe_wait_icr_idle)(void);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Called very early during boot from get_smp_config().  It should
+	 * return the logical apicid.  x86_[bios]_cpu_to_apicid is
+	 * initialized before this function is called.
+	 *
+	 * If logical apicid can't be determined that early, the function
+	 * may return BAD_APICID.  Logical apicid will be configured after
+	 * init_apic_ldr() while bringing up CPUs.  Note that NUMA affinity
+	 * won't be applied properly during early boot in this case.
+	 */
+	int (*x86_32_early_logical_apicid)(int cpu);
+
+	/* determine CPU -> NUMA node mapping */
+	int (*x86_32_numa_cpu_node)(int cpu);
+#endif
 };
 
 /*
@@ -503,6 +518,11 @@ extern struct apic apic_noop;
 
 extern struct apic apic_default;
 
+static inline int noop_x86_32_early_logical_apicid(int cpu)
+{
+	return BAD_APICID;
+}
+
 /*
  * Set up the logical destination ID.
  *
@@ -522,7 +542,7 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
 	return cpuid_apic >> index_msb;
 }
 
-extern int default_apicid_to_node(int logical_apicid);
+extern int default_x86_32_numa_cpu_node(int cpu);
 
 #endif
 
@@ -558,12 +578,6 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma
 	*retmap = *phys_map;
 }
 
-/* Mapping from cpu number to logical apicid */
-static inline int default_cpu_to_logical_apicid(int cpu)
-{
-	return 1 << cpu;
-}
-
 static inline int __default_cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
@@ -596,8 +610,4 @@ extern int default_check_phys_apicid_present(int phys_apicid);
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
-#ifdef CONFIG_X86_32
-extern u8 cpu_2_logical_apicid[NR_CPUS];
-#endif
-
 #endif /* _ASM_X86_APIC_H */
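The new x86_32_early_logical_apicid() callback replaces the per-driver cpu_to_logical_apicid() helpers that the rest of this patch removes. For the common flat-logical case the mapping is exactly the one visible in the deleted default_cpu_to_logical_apicid() above; a minimal sketch of such a driver callback follows (the function name and placement are illustrative, this hunk set does not show the default driver's actual definition):

	/* Sketch only: flat logical delivery keeps one LDR bit per CPU,
	 * so the early logical apicid is simply 1 << cpu, mirroring the
	 * removed default_cpu_to_logical_apicid() helper. */
	static int default_x86_32_early_logical_apicid(int cpu)
	{
		return 1 << cpu;
	}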
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 57650ab4a5f5..1cd6d26a0a8d 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -16,10 +16,13 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
 BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
 BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 
-.irpc idx, "01234567"
+.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if NUM_INVALIDATE_TLB_VECTORS > \idx
 BUILD_INTERRUPT3(invalidate_interrupt\idx,
 		 (INVALIDATE_TLB_VECTOR_START)+\idx,
 		 smp_invalidate_interrupt)
+.endif
 .endr
 #endif
 
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 0274ec5a7e62..bb9efe8706e2 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -45,6 +45,30 @@ extern void invalidate_interrupt4(void);
 extern void invalidate_interrupt5(void);
 extern void invalidate_interrupt6(void);
 extern void invalidate_interrupt7(void);
+extern void invalidate_interrupt8(void);
+extern void invalidate_interrupt9(void);
+extern void invalidate_interrupt10(void);
+extern void invalidate_interrupt11(void);
+extern void invalidate_interrupt12(void);
+extern void invalidate_interrupt13(void);
+extern void invalidate_interrupt14(void);
+extern void invalidate_interrupt15(void);
+extern void invalidate_interrupt16(void);
+extern void invalidate_interrupt17(void);
+extern void invalidate_interrupt18(void);
+extern void invalidate_interrupt19(void);
+extern void invalidate_interrupt20(void);
+extern void invalidate_interrupt21(void);
+extern void invalidate_interrupt22(void);
+extern void invalidate_interrupt23(void);
+extern void invalidate_interrupt24(void);
+extern void invalidate_interrupt25(void);
+extern void invalidate_interrupt26(void);
+extern void invalidate_interrupt27(void);
+extern void invalidate_interrupt28(void);
+extern void invalidate_interrupt29(void);
+extern void invalidate_interrupt30(void);
+extern void invalidate_interrupt31(void);
 
 extern void irq_move_cleanup_interrupt(void);
 extern void reboot_interrupt(void);
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 36fb1a6a5109..8dbe353e41e1 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -11,8 +11,8 @@ kernel_physical_mapping_init(unsigned long start,
 					     unsigned long page_size_mask);
 
 
-extern unsigned long __initdata e820_table_start;
-extern unsigned long __meminitdata e820_table_end;
-extern unsigned long __meminitdata e820_table_top;
+extern unsigned long __initdata pgt_buf_start;
+extern unsigned long __meminitdata pgt_buf_end;
+extern unsigned long __meminitdata pgt_buf_top;
 
 #endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 0b7228268a63..615fa9061b57 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -123,10 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
 						 int vector);
 extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
 						 int vector);
-extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
-						 int vector);
-extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
-						 int vector);
 
 /* Avoid include hell */
 #define NMI_VECTOR 0x02
@@ -150,6 +146,10 @@ static inline void __default_local_send_IPI_all(int vector)
 }
 
 #ifdef CONFIG_X86_32
+extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
+						 int vector);
+extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+						 int vector);
 extern void default_send_IPI_mask_logical(const struct cpumask *mask,
 					 int vector);
 extern void default_send_IPI_allbutself(int vector);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6af0894dafb4..6e976ee3b3ef 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_IRQ_VECTORS_H
 #define _ASM_X86_IRQ_VECTORS_H
 
+#include <linux/threads.h>
 /*
  * Linux IRQ vector layout.
  *
@@ -16,8 +17,8 @@
  * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
  * Vectors 32 ... 127 : device interrupts
  * Vector 128 : legacy int80 syscall interface
- * Vectors 129 ... 237 : device interrupts
- * Vectors 238 ... 255 : special interrupts
+ * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
+ * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
 *
 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
 *
@@ -96,37 +97,43 @@
 #define THRESHOLD_APIC_VECTOR 0xf9
 #define REBOOT_VECTOR 0xf8
 
-/* f0-f7 used for spreading out TLB flushes: */
-#define INVALIDATE_TLB_VECTOR_END 0xf7
-#define INVALIDATE_TLB_VECTOR_START 0xf0
-#define NUM_INVALIDATE_TLB_VECTORS 8
-
-/*
- * Local APIC timer IRQ vector is on a different priority level,
- * to work around the 'lost local interrupt if more than 2 IRQ
- * sources per level' errata.
- */
-#define LOCAL_TIMER_VECTOR 0xef
-
 /*
  * Generic system vector for platform specific use
  */
-#define X86_PLATFORM_IPI_VECTOR 0xed
+#define X86_PLATFORM_IPI_VECTOR 0xf7
 
 /*
  * IRQ work vector:
  */
-#define IRQ_WORK_VECTOR 0xec
+#define IRQ_WORK_VECTOR 0xf6
 
-#define UV_BAU_MESSAGE 0xea
+#define UV_BAU_MESSAGE 0xf5
 
 /*
  * Self IPI vector for machine checks
  */
-#define MCE_SELF_VECTOR 0xeb
+#define MCE_SELF_VECTOR 0xf4
 
 /* Xen vector callback to receive events in a HVM domain */
-#define XEN_HVM_EVTCHN_CALLBACK 0xe9
+#define XEN_HVM_EVTCHN_CALLBACK 0xf3
+
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR 0xef
+
+/* up to 32 vectors used for spreading out TLB flushes: */
+#if NR_CPUS <= 32
+# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS)
+#else
+# define NUM_INVALIDATE_TLB_VECTORS (32)
+#endif
+
+#define INVALIDATE_TLB_VECTOR_END (0xee)
+#define INVALIDATE_TLB_VECTOR_START \
+	(INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
 
 #define NR_VECTORS 256
 
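The special-purpose vectors move up into 0xf3-0xf7 so that everything from INVALIDATE_TLB_VECTOR_START through 0xee can be handed to TLB-flush IPIs, with the count scaling with NR_CPUS up to a cap of 32. A standalone sketch of the arithmetic (NR_CPUS is a build-time constant in the real header; the values below are just examples):

	#include <stdio.h>

	/* mirrors NUM_INVALIDATE_TLB_VECTORS / INVALIDATE_TLB_VECTOR_START */
	int main(void)
	{
		int examples[] = { 8, 32, 64 };	/* hypothetical NR_CPUS values */
		int i;

		for (i = 0; i < 3; i++) {
			int n = examples[i] <= 32 ? examples[i] : 32;
			int end = 0xee;
			int start = end - n + 1;

			printf("NR_CPUS=%-3d -> %2d vectors: 0x%02x..0x%02x\n",
			       examples[i], n, start, end);
		}
		/* prints 0xe7..0xee for NR_CPUS=8, 0xcf..0xee for 32 and up */
		return 0;
	}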
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 0c90dd9f0505..9c7d95f6174b 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -25,7 +25,6 @@ extern int pic_mode;
 #define MAX_IRQ_SOURCES 256
 
 extern unsigned int def_to_bigsmp;
-extern u8 apicid_2_node[];
 
 #ifdef CONFIG_X86_NUMAQ
 extern int mp_bus_id_to_node[MAX_MP_BUSSES];
@@ -33,8 +32,6 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES];
 extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
 #endif
 
-#define MAX_APICID 256
-
 #else /* CONFIG_X86_64: */
 
 #define MAX_MP_BUSSES 256
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 27da400d3138..3d4dab43c994 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,5 +1,57 @@
+#ifndef _ASM_X86_NUMA_H
+#define _ASM_X86_NUMA_H
+
+#include <asm/topology.h>
+#include <asm/apicdef.h>
+
+#ifdef CONFIG_NUMA
+
+#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+
+/*
+ * __apicid_to_node[] stores the raw mapping between physical apicid and
+ * node and is used to initialize cpu_to_node mapping.
+ *
+ * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
+ * should be accessed by the accessors - set_apicid_to_node() and
+ * numa_cpu_node().
+ */
+extern s16 __apicid_to_node[MAX_LOCAL_APIC];
+
+static inline void set_apicid_to_node(int apicid, s16 node)
+{
+	__apicid_to_node[apicid] = node;
+}
+#else /* CONFIG_NUMA */
+static inline void set_apicid_to_node(int apicid, s16 node)
+{
+}
+#endif /* CONFIG_NUMA */
+
 #ifdef CONFIG_X86_32
 # include "numa_32.h"
 #else
 # include "numa_64.h"
 #endif
+
+#ifdef CONFIG_NUMA
+extern void __cpuinit numa_set_node(int cpu, int node);
+extern void __cpuinit numa_clear_node(int cpu);
+extern void __init numa_init_array(void);
+extern void __init init_cpu_to_node(void);
+extern void __cpuinit numa_add_cpu(int cpu);
+extern void __cpuinit numa_remove_cpu(int cpu);
+#else /* CONFIG_NUMA */
+static inline void numa_set_node(int cpu, int node) { }
+static inline void numa_clear_node(int cpu) { }
+static inline void numa_init_array(void) { }
+static inline void init_cpu_to_node(void) { }
+static inline void numa_add_cpu(int cpu) { }
+static inline void numa_remove_cpu(int cpu) { }
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable);
+#endif
+
+#endif /* _ASM_X86_NUMA_H */
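Taken together with the boot.c and apic.c hunks below, the intended flow of the new accessors is: firmware-parsing code records the physical apicid -> node mapping, and a CPU's node is resolved through its apicid when the CPU is registered. A comment-only sketch of that flow (simplified; see the actual hunks for the real call sites):

	/* 1. SRAT/AMD topology parsing records the raw mapping:
	 *        set_apicid_to_node(physid, nid);
	 *
	 * 2. numa_cpu_node(cpu) resolves the node via the CPU's apicid;
	 *    e.g. the default x86_32 variant in apic.c below reads
	 *        __apicid_to_node[early_per_cpu(x86_cpu_to_apicid, cpu)];
	 *
	 * 3. the result then seeds the cpu_to_node mapping:
	 *        numa_set_node(cpu, nid);
	 */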
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index b0ef2b449a9d..c6beed1ef103 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -4,7 +4,12 @@
 extern int numa_off;
 
 extern int pxm_to_nid(int pxm);
-extern void numa_remove_cpu(int cpu);
+
+#ifdef CONFIG_NUMA
+extern int __cpuinit numa_cpu_node(int cpu);
+#else /* CONFIG_NUMA */
+static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
+#endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_HIGHMEM
 extern void set_highmem_pages_init(void);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 0493be39607c..344eb1790b46 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -2,23 +2,16 @@
 #define _ASM_X86_NUMA_64_H
 
 #include <linux/nodemask.h>
-#include <asm/apicdef.h>
 
 struct bootnode {
 	u64 start;
 	u64 end;
 };
 
-extern int compute_hash_shift(struct bootnode *nodes, int numblks,
-			      int *nodeids);
-
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
-extern void numa_init_array(void);
 extern int numa_off;
 
-extern s16 apicid_to_node[MAX_LOCAL_APIC];
-
 extern unsigned long numa_free_all_bootmem(void);
 extern void setup_node_bootmem(int nodeid, unsigned long start,
 			       unsigned long end);
@@ -31,11 +24,11 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
  */
 #define NODE_MIN_SIZE (4*1024*1024)
 
-extern void __init init_cpu_to_node(void);
-extern void __cpuinit numa_set_node(int cpu, int node);
-extern void __cpuinit numa_clear_node(int cpu);
-extern void __cpuinit numa_add_cpu(int cpu);
-extern void __cpuinit numa_remove_cpu(int cpu);
+extern nodemask_t numa_nodes_parsed __initdata;
+
+extern int __cpuinit numa_cpu_node(int cpu);
+extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+extern void __init numa_set_distance(int from, int to, int distance);
 
 #ifdef CONFIG_NUMA_EMU
 #define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
@@ -43,11 +36,7 @@ extern void __cpuinit numa_remove_cpu(int cpu);
 void numa_emu_cmdline(char *);
 #endif /* CONFIG_NUMA_EMU */
 #else
-static inline void init_cpu_to_node(void) { }
-static inline void numa_set_node(int cpu, int node) { }
-static inline void numa_clear_node(int cpu) { }
-static inline void numa_add_cpu(int cpu, int node) { }
-static inline void numa_remove_cpu(int cpu) { }
+static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
 #endif
 
 #endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 1df66211fd1b..bce688d54c12 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_PAGE_DEFS_H
 
 #include <linux/const.h>
+#include <linux/types.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT 12
@@ -45,11 +46,15 @@ extern int devmem_is_allowed(unsigned long pagenr);
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
+static inline phys_addr_t get_max_mapped(void)
+{
+	return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
+}
+
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
-			 int acpi, int k8);
+extern void initmem_init(void);
 extern void free_initmem(void);
 
 #endif /* !__ASSEMBLY__ */
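The new get_max_mapped() helper is a plain pfn-to-byte-address conversion; the added <linux/types.h> include supplies phys_addr_t, and the cast keeps the shift from overflowing unsigned long on 32-bit builds. A worked example with a made-up pfn count:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SHIFT 12

	int main(void)
	{
		/* hypothetical value: 0x100000 mapped 4 KiB pages */
		unsigned long max_pfn_mapped = 0x100000;

		/* mirrors get_max_mapped(): widen *before* shifting */
		uint64_t max_mapped = (uint64_t)max_pfn_mapped << PAGE_SHIFT;

		/* prints 0x100000 pages -> 0x100000000 bytes, i.e. 4 GiB */
		printf("%#lx pages -> %#llx bytes\n",
		       max_pfn_mapped, (unsigned long long)max_mapped);
		return 0;
	}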
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 99fa8b47381e..73b11bc0ae6f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -55,6 +55,9 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 
 DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
+DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
+#endif
 
 /* Static state in head.S used to set up a CPU */
 extern unsigned long stack_start; /* Initial stack pointer address */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 21899cc31e52..910a7084f7f2 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -47,21 +47,6 @@
 
 #include <asm/mpspec.h>
 
-#ifdef CONFIG_X86_32
-
-/* Mappings between logical cpu number and node number */
-extern int cpu_to_node_map[];
-
-/* Returns the number of the node containing CPU 'cpu' */
-static inline int __cpu_to_node(int cpu)
-{
-	return cpu_to_node_map[cpu];
-}
-#define early_cpu_to_node __cpu_to_node
-#define cpu_to_node __cpu_to_node
-
-#else /* CONFIG_X86_64 */
-
 /* Mappings between logical cpu number and node number */
 DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 
@@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu)
 
 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
 
-#endif /* CONFIG_X86_64 */
-
 /* Mappings between node number and cpus on that node. */
 extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 
@@ -155,7 +138,7 @@ extern unsigned long node_remap_size[];
 	.balance_interval = 1, \
 }
 
-#ifdef CONFIG_X86_64_ACPI_NUMA
+#ifdef CONFIG_X86_64
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 #endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3e6e2d68f761..9a966c579af5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -595,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 	nid = acpi_get_node(handle);
 	if (nid == -1 || !node_online(nid))
 		return;
-#ifdef CONFIG_X86_64
-	apicid_to_node[physid] = nid;
+	set_apicid_to_node(physid, nid);
 	numa_set_node(cpu, nid);
-#else /* CONFIG_X86_32 */
-	apicid_2_node[physid] = nid;
-	cpu_to_node_map[cpu] = nid;
-#endif
-
 #endif
 }
 
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 0a99f7198bc3..ed3c2e5b714a 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -12,7 +12,7 @@
 
 static u32 *flush_words;
 
-struct pci_device_id amd_nb_misc_ids[] = {
+const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
@@ -20,6 +20,11 @@ struct pci_device_id amd_nb_misc_ids[] = {
 };
 EXPORT_SYMBOL(amd_nb_misc_ids);
 
+static struct pci_device_id amd_nb_link_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_LINK) },
+	{}
+};
+
 const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
 	{ 0x00, 0x18, 0x20 },
 	{ 0xff, 0x00, 0x20 },
@@ -31,7 +36,7 @@ struct amd_northbridge_info amd_northbridges;
 EXPORT_SYMBOL(amd_northbridges);
 
 static struct pci_dev *next_northbridge(struct pci_dev *dev,
-					struct pci_device_id *ids)
+					const struct pci_device_id *ids)
 {
 	do {
 		dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
@@ -45,7 +50,7 @@ int amd_cache_northbridges(void)
 {
 	int i = 0;
 	struct amd_northbridge *nb;
-	struct pci_dev *misc;
+	struct pci_dev *misc, *link;
 
 	if (amd_nb_num())
 		return 0;
@@ -64,10 +69,12 @@ int amd_cache_northbridges(void)
 	amd_northbridges.nb = nb;
 	amd_northbridges.num = i;
 
-	misc = NULL;
+	link = misc = NULL;
 	for (i = 0; i != amd_nb_num(); i++) {
 		node_to_amd_nb(i)->misc = misc =
 			next_northbridge(misc, amd_nb_misc_ids);
+		node_to_amd_nb(i)->link = link =
+			next_northbridge(link, amd_nb_link_ids);
 	}
 
 	/* some CPU families (e.g. family 0x11) do not support GART */
@@ -85,6 +92,13 @@ int amd_cache_northbridges(void)
 	     boot_cpu_data.x86_mask >= 0x1))
 		amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
 
+	if (boot_cpu_data.x86 == 0x15)
+		amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+	/* L3 cache partitioning is supported on family 0x15 */
+	if (boot_cpu_data.x86 == 0x15)
+		amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(amd_cache_northbridges);
@@ -93,8 +107,9 @@ EXPORT_SYMBOL_GPL(amd_cache_northbridges);
    they're useless anyways */
 int __init early_is_amd_nb(u32 device)
 {
-	struct pci_device_id *id;
+	const struct pci_device_id *id;
 	u32 vendor = device & 0xffff;
+
 	device >>= 16;
 	for (id = amd_nb_misc_ids; id->vendor; id++)
 		if (vendor == id->vendor && device == id->device)
@@ -102,6 +117,65 @@ int __init early_is_amd_nb(u32 device)
 	return 0;
 }
 
+int amd_get_subcaches(int cpu)
+{
+	struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
+	unsigned int mask;
+	int cuid = 0;
+
+	if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+		return 0;
+
+	pci_read_config_dword(link, 0x1d4, &mask);
+
+#ifdef CONFIG_SMP
+	cuid = cpu_data(cpu).compute_unit_id;
+#endif
+	return (mask >> (4 * cuid)) & 0xf;
+}
+
+int amd_set_subcaches(int cpu, int mask)
+{
+	static unsigned int reset, ban;
+	struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
+	unsigned int reg;
+	int cuid = 0;
+
+	if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
+		return -EINVAL;
+
+	/* if necessary, collect reset state of L3 partitioning and BAN mode */
+	if (reset == 0) {
+		pci_read_config_dword(nb->link, 0x1d4, &reset);
+		pci_read_config_dword(nb->misc, 0x1b8, &ban);
+		ban &= 0x180000;
+	}
+
+	/* deactivate BAN mode if any subcaches are to be disabled */
+	if (mask != 0xf) {
+		pci_read_config_dword(nb->misc, 0x1b8, &reg);
+		pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
+	}
+
+#ifdef CONFIG_SMP
+	cuid = cpu_data(cpu).compute_unit_id;
+#endif
+	mask <<= 4 * cuid;
+	mask |= (0xf ^ (1 << cuid)) << 26;
+
+	pci_write_config_dword(nb->link, 0x1d4, mask);
+
+	/* reset BAN mode if L3 partitioning returned to reset state */
+	pci_read_config_dword(nb->link, 0x1d4, &reg);
+	if (reg == reset) {
+		pci_read_config_dword(nb->misc, 0x1b8, &reg);
+		reg &= ~0x180000;
+		pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
+	}
+
+	return 0;
+}
+
 int amd_cache_gart(void)
 {
 	int i;
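amd_get_subcaches()/amd_set_subcaches() treat the link device's 0x1d4 config register as an array of 4-bit fields, one per compute unit, each selecting which of the four L3 subcaches that unit may use. A standalone sketch of the decode step (the register value below is invented for illustration):

	#include <stdio.h>

	int main(void)
	{
		unsigned int reg = 0xf3c5;	/* hypothetical 0x1d4 contents */
		int cuid;

		/* same field extraction as amd_get_subcaches() */
		for (cuid = 0; cuid < 4; cuid++)
			printf("compute unit %d -> subcache mask 0x%x\n",
			       cuid, (reg >> (4 * cuid)) & 0xf);
		return 0;	/* masks: 0x5, 0xc, 0x3, 0xf */
	}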
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 5955a7800a96..7b1e8e10b89c 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -13,7 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/mmzone.h>
 #include <linux/pci_ids.h>
 #include <linux/pci.h>
@@ -57,7 +57,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
 static u32 __init allocate_aperture(void)
 {
 	u32 aper_size;
-	void *p;
+	unsigned long addr;
 
 	/* aper_size should <= 1G */
 	if (fallback_aper_order > 5)
@@ -83,27 +83,26 @@ static u32 __init allocate_aperture(void)
 	 * so don't use 512M below as gart iommu, leave the space for kernel
 	 * code for safe
	 */
-	p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
+	addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
+	if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
+		printk(KERN_ERR
+			"Cannot allocate aperture memory hole (%lx,%uK)\n",
+			addr, aper_size>>10);
+		return 0;
+	}
+	memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
 	/*
 	 * Kmemleak should not scan this block as it may not be mapped via the
 	 * kernel direct mapping.
 	 */
-	kmemleak_ignore(p);
-	if (!p || __pa(p)+aper_size > 0xffffffff) {
-		printk(KERN_ERR
-			"Cannot allocate aperture memory hole (%p,%uK)\n",
-			p, aper_size>>10);
-		if (p)
-			free_bootmem(__pa(p), aper_size);
-		return 0;
-	}
+	kmemleak_ignore(phys_to_virt(addr));
 	printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
-			aper_size >> 10, __pa(p));
-	insert_aperture_resource((u32)__pa(p), aper_size);
-	register_nosave_region((u32)__pa(p) >> PAGE_SHIFT,
-			(u32)__pa(p+aper_size) >> PAGE_SHIFT);
+			aper_size >> 10, addr);
+	insert_aperture_resource((u32)addr, aper_size);
+	register_nosave_region(addr >> PAGE_SHIFT,
+			(addr+aper_size) >> PAGE_SHIFT);
 
-	return (u32)__pa(p);
+	return (u32)addr;
 }
 
 
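The bootmem-to-memblock conversion splits allocation into a side-effect-free search plus an explicit reservation, which is why the error path no longer needs the free_bootmem() undo that the old code carried. A comment-only sketch of the idiom as this hunk uses it (kernel-internal APIs of this era, shown schematically, not standalone-compilable):

	/*
	 *	addr = memblock_find_in_range(start, end, size, align);
	 *	if (addr == MEMBLOCK_ERROR)
	 *		return ...;	// nothing was claimed, nothing to undo
	 *	memblock_x86_reserve_range(addr, addr + size, "label");
	 *				// only this step reserves the range
	 */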
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 48dcd2e83b46..562a8325cc1c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -79,6 +79,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
 
 #ifdef CONFIG_X86_32
+
+/*
+ * On x86_32, the mapping between cpu and logical apicid may vary
+ * depending on apic in use.  The following early percpu variable is
+ * used for the mapping.  This is where the behaviors of x86_64 and 32
+ * actually diverge.  Let's keep it ugly for now.
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
+
 /*
  * Knob to control our willingness to enable the local APIC.
  *
@@ -1238,6 +1247,19 @@ void __cpuinit setup_local_APIC(void)
 	 */
 	apic->init_apic_ldr();
 
+#ifdef CONFIG_X86_32
+	/*
+	 * APIC LDR is initialized.  If logical_apicid mapping was
+	 * initialized during get_smp_config(), make sure it matches the
+	 * actual value.
+	 */
+	i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+	WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
+	/* always use the value from LDR */
+	early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+		logical_smp_processor_id();
+#endif
+
 	/*
 	 * Set Task Priority to 'accept all'. We never change this
 	 * later on.
@@ -1979,7 +2001,10 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
 	early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
 #endif
-
+#ifdef CONFIG_X86_32
+	early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+		apic->x86_32_early_logical_apicid(cpu);
+#endif
 	set_cpu_possible(cpu, true);
 	set_cpu_present(cpu, true);
 }
@@ -2000,10 +2025,14 @@ void default_init_apic_ldr(void)
 }
 
 #ifdef CONFIG_X86_32
-int default_apicid_to_node(int logical_apicid)
+int default_x86_32_numa_cpu_node(int cpu)
 {
-#ifdef CONFIG_SMP
-	return apicid_2_node[hard_smp_processor_id()];
+#ifdef CONFIG_NUMA
+	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+
+	if (apicid != BAD_APICID)
+		return __apicid_to_node[apicid];
+	return NUMA_NO_NODE;
 #else
 	return 0;
 #endif
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 09d3b17ce0c2..5652d31fe108 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -185,8 +185,6 @@ struct apic apic_flat = {
 	.ioapic_phys_id_map = NULL,
 	.setup_apic_routing = NULL,
 	.multi_timer_check = NULL,
-	.apicid_to_node = NULL,
-	.cpu_to_logical_apicid = NULL,
 	.cpu_present_to_apicid = default_cpu_present_to_apicid,
 	.apicid_to_cpu_present = NULL,
 	.setup_portio_remap = NULL,
@@ -337,8 +335,6 @@ struct apic apic_physflat = {
 	.ioapic_phys_id_map = NULL,
 	.setup_apic_routing = NULL,
 	.multi_timer_check = NULL,
-	.apicid_to_node = NULL,
-	.cpu_to_logical_apicid = NULL,
 	.cpu_present_to_apicid = default_cpu_present_to_apicid,
 	.apicid_to_cpu_present = NULL,
 	.setup_portio_remap = NULL,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e31b9ffe25f5..f1baa2dc087a 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void)
 	return 0;
 }
 
-static int noop_cpu_to_logical_apicid(int cpu)
-{
-	return 0;
-}
-
 static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
 {
 	return 0;
@@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
 	cpumask_set_cpu(cpu, retmask);
 }
 
-int noop_apicid_to_node(int logical_apicid)
-{
-	/* we're always on node 0 */
-	return 0;
-}
-
 static u32 noop_apic_read(u32 reg)
 {
 	WARN_ON_ONCE((cpu_has_apic && !disable_apic));
@@ -130,6 +119,14 @@ static void noop_apic_write(u32 reg, u32 v)
 	WARN_ON_ONCE(cpu_has_apic && !disable_apic);
 }
 
+#ifdef CONFIG_X86_32
+static int noop_x86_32_numa_cpu_node(int cpu)
+{
+	/* we're always on node 0 */
+	return 0;
+}
+#endif
+
 struct apic apic_noop = {
 	.name = "noop",
 	.probe = noop_probe,
@@ -153,9 +150,7 @@ struct apic apic_noop = {
 	.ioapic_phys_id_map = default_ioapic_phys_id_map,
 	.setup_apic_routing = NULL,
 	.multi_timer_check = NULL,
-	.apicid_to_node = noop_apicid_to_node,
 
-	.cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
 	.cpu_present_to_apicid = default_cpu_present_to_apicid,
 	.apicid_to_cpu_present = physid_set_mask_of_physid,
 
@@ -197,4 +192,9 @@ struct apic apic_noop = {
 	.icr_write = noop_apic_icr_write,
 	.wait_icr_idle = noop_apic_wait_icr_idle,
 	.safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
+
+#ifdef CONFIG_X86_32
+	.x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
+	.x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node,
+#endif
 };
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cb804c5091b9..541a2e431659 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit)
 	return 1;
 }
 
+static int bigsmp_early_logical_apicid(int cpu)
+{
+	/* on bigsmp, logical apicid is the same as physical */
+	return early_per_cpu(x86_cpu_to_apicid, cpu);
+}
+
 static inline unsigned long calculate_ldr(int cpu)
 {
 	unsigned long val, id;
@@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void)
 		nr_ioapics);
 }
 
-static int bigsmp_apicid_to_node(int logical_apicid)
-{
-	return apicid_2_node[hard_smp_processor_id()];
-}
-
 static int bigsmp_cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < nr_cpu_ids)
@@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
 	return BAD_APICID;
 }
 
-/* Mapping from cpu number to logical apicid */
-static inline int bigsmp_cpu_to_logical_apicid(int cpu)
-{
-	if (cpu >= nr_cpu_ids)
-		return BAD_APICID;
-	return cpu_physical_id(cpu);
-}
-
 static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
 {
 	/* For clustered we don't have a good way to do this yet - hack */
@@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
 /* As we are using single CPU as destination, pick only one CPU here */
 static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-	return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask));
+	int cpu = cpumask_first(cpumask);
+
+	if (cpu < nr_cpu_ids)
+		return cpu_physical_id(cpu);
+	return BAD_APICID;
 }
 
 static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
@@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 	 */
 	for_each_cpu_and(cpu, cpumask, andmask) {
 		if (cpumask_test_cpu(cpu, cpu_online_mask))
-			break;
+			return cpu_physical_id(cpu);
 	}
-	return bigsmp_cpu_to_logical_apicid(cpu);
+	return BAD_APICID;
 }
 
 static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -219,8 +216,6 @@ struct apic apic_bigsmp = {
 	.ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
 	.setup_apic_routing = bigsmp_setup_apic_routing,
 	.multi_timer_check = NULL,
-	.apicid_to_node = bigsmp_apicid_to_node,
-	.cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
 	.cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
 	.apicid_to_cpu_present = physid_set_mask_of_physid,
 	.setup_portio_remap = NULL,
@@ -256,4 +251,7 @@ struct apic apic_bigsmp = {
 	.icr_write = native_apic_icr_write,
 	.wait_icr_idle = native_apic_wait_icr_idle,
 	.safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+	.x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
+	.x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
 };
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 8593582d8022..3e9de4854c5b 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c | |||
@@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit) | |||
460 | return physid_isset(bit, phys_cpu_present_map); | 460 | return physid_isset(bit, phys_cpu_present_map); |
461 | } | 461 | } |
462 | 462 | ||
463 | static int es7000_early_logical_apicid(int cpu) | ||
464 | { | ||
465 | /* on es7000, logical apicid is the same as physical */ | ||
466 | return early_per_cpu(x86_bios_cpu_apicid, cpu); | ||
467 | } | ||
468 | |||
463 | static unsigned long calculate_ldr(int cpu) | 469 | static unsigned long calculate_ldr(int cpu) |
464 | { | 470 | { |
465 | unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); | 471 | unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); |
@@ -504,12 +510,11 @@ static void es7000_setup_apic_routing(void) | |||
504 | nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); | 510 | nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); |
505 | } | 511 | } |
506 | 512 | ||
507 | static int es7000_apicid_to_node(int logical_apicid) | 513 | static int es7000_numa_cpu_node(int cpu) |
508 | { | 514 | { |
509 | return 0; | 515 | return 0; |
510 | } | 516 | } |
511 | 517 | ||
512 | |||
513 | static int es7000_cpu_present_to_apicid(int mps_cpu) | 518 | static int es7000_cpu_present_to_apicid(int mps_cpu) |
514 | { | 519 | { |
515 | if (!mps_cpu) | 520 | if (!mps_cpu) |
@@ -528,18 +533,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) | |||
528 | ++cpu_id; | 533 | ++cpu_id; |
529 | } | 534 | } |
530 | 535 | ||
531 | /* Mapping from cpu number to logical apicid */ | ||
532 | static int es7000_cpu_to_logical_apicid(int cpu) | ||
533 | { | ||
534 | #ifdef CONFIG_SMP | ||
535 | if (cpu >= nr_cpu_ids) | ||
536 | return BAD_APICID; | ||
537 | return cpu_2_logical_apicid[cpu]; | ||
538 | #else | ||
539 | return logical_smp_processor_id(); | ||
540 | #endif | ||
541 | } | ||
542 | |||
543 | static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) | 536 | static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) |
544 | { | 537 | { |
545 | /* For clustered we don't have a good way to do this yet - hack */ | 538 | /* For clustered we don't have a good way to do this yet - hack */ |
@@ -561,7 +554,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) | |||
561 | * The cpus in the mask must all be on the apic cluster. | 554 | * The cpus in the mask must all be on the apic cluster. |
562 | */ | 555 | */ |
563 | for_each_cpu(cpu, cpumask) { | 556 | for_each_cpu(cpu, cpumask) { |
564 | int new_apicid = es7000_cpu_to_logical_apicid(cpu); | 557 | int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); |
565 | 558 | ||
566 | if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { | 559 | if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { |
567 | WARN(1, "Not a valid mask!"); | 560 | WARN(1, "Not a valid mask!"); |
@@ -578,7 +571,7 @@ static unsigned int | |||
578 | es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, | 571 | es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, |
579 | const struct cpumask *andmask) | 572 | const struct cpumask *andmask) |
580 | { | 573 | { |
581 | int apicid = es7000_cpu_to_logical_apicid(0); | 574 | int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); |
582 | cpumask_var_t cpumask; | 575 | cpumask_var_t cpumask; |
583 | 576 | ||
584 | if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) | 577 | if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) |
@@ -655,8 +648,6 @@ struct apic __refdata apic_es7000_cluster = { | |||
655 | .ioapic_phys_id_map = es7000_ioapic_phys_id_map, | 648 | .ioapic_phys_id_map = es7000_ioapic_phys_id_map, |
656 | .setup_apic_routing = es7000_setup_apic_routing, | 649 | .setup_apic_routing = es7000_setup_apic_routing, |
657 | .multi_timer_check = NULL, | 650 | .multi_timer_check = NULL, |
658 | .apicid_to_node = es7000_apicid_to_node, | ||
659 | .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, | ||
660 | .cpu_present_to_apicid = es7000_cpu_present_to_apicid, | 651 | .cpu_present_to_apicid = es7000_cpu_present_to_apicid, |
661 | .apicid_to_cpu_present = es7000_apicid_to_cpu_present, | 652 | .apicid_to_cpu_present = es7000_apicid_to_cpu_present, |
662 | .setup_portio_remap = NULL, | 653 | .setup_portio_remap = NULL, |
@@ -695,6 +686,9 @@ struct apic __refdata apic_es7000_cluster = { | |||
695 | .icr_write = native_apic_icr_write, | 686 | .icr_write = native_apic_icr_write, |
696 | .wait_icr_idle = native_apic_wait_icr_idle, | 687 | .wait_icr_idle = native_apic_wait_icr_idle, |
697 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, | 688 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, |
689 | |||
690 | .x86_32_early_logical_apicid = es7000_early_logical_apicid, | ||
691 | .x86_32_numa_cpu_node = es7000_numa_cpu_node, | ||
698 | }; | 692 | }; |
699 | 693 | ||
700 | struct apic __refdata apic_es7000 = { | 694 | struct apic __refdata apic_es7000 = { |
@@ -720,8 +714,6 @@ struct apic __refdata apic_es7000 = { | |||
720 | .ioapic_phys_id_map = es7000_ioapic_phys_id_map, | 714 | .ioapic_phys_id_map = es7000_ioapic_phys_id_map, |
721 | .setup_apic_routing = es7000_setup_apic_routing, | 715 | .setup_apic_routing = es7000_setup_apic_routing, |
722 | .multi_timer_check = NULL, | 716 | .multi_timer_check = NULL, |
723 | .apicid_to_node = es7000_apicid_to_node, | ||
724 | .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, | ||
725 | .cpu_present_to_apicid = es7000_cpu_present_to_apicid, | 717 | .cpu_present_to_apicid = es7000_cpu_present_to_apicid, |
726 | .apicid_to_cpu_present = es7000_apicid_to_cpu_present, | 718 | .apicid_to_cpu_present = es7000_apicid_to_cpu_present, |
727 | .setup_portio_remap = NULL, | 719 | .setup_portio_remap = NULL, |
@@ -758,4 +750,7 @@ struct apic __refdata apic_es7000 = { | |||
758 | .icr_write = native_apic_icr_write, | 750 | .icr_write = native_apic_icr_write, |
759 | .wait_icr_idle = native_apic_wait_icr_idle, | 751 | .wait_icr_idle = native_apic_wait_icr_idle, |
760 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, | 752 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, |
753 | |||
754 | .x86_32_early_logical_apicid = es7000_early_logical_apicid, | ||
755 | .x86_32_numa_cpu_node = es7000_numa_cpu_node, | ||
761 | }; | 756 | }; |
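Note on the es7000 conversion above: the driver-private cpu_to_logical_apicid() callback and the cpu_2_logical_apicid[] array are replaced by the x86_cpu_to_logical_apicid early per-CPU map, populated at boot through the new x86_32_early_logical_apicid hook. A minimal sketch of the lookup idiom — the bounds-checking wrapper is hypothetical, mirroring what the removed SMP branch used to do:

        /* Hypothetical helper, not part of the patch: same semantics the
         * removed es7000_cpu_to_logical_apicid() had on SMP builds. */
        static int logical_apicid_of(int cpu)
        {
                if (cpu >= nr_cpu_ids)
                        return BAD_APICID;
                return early_per_cpu(x86_cpu_to_logical_apicid, cpu);
        }

The converted callers (es7000_cpu_mask_to_apicid() and friends) drop the range check because they only iterate over valid CPUs taken from a cpumask.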
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 08385e090a6f..cce91bf26676 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c | |||
@@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, | |||
56 | local_irq_restore(flags); | 56 | local_irq_restore(flags); |
57 | } | 57 | } |
58 | 58 | ||
59 | #ifdef CONFIG_X86_32 | ||
60 | |||
59 | void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, | 61 | void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, |
60 | int vector) | 62 | int vector) |
61 | { | 63 | { |
@@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, | |||
71 | local_irq_save(flags); | 73 | local_irq_save(flags); |
72 | for_each_cpu(query_cpu, mask) | 74 | for_each_cpu(query_cpu, mask) |
73 | __default_send_IPI_dest_field( | 75 | __default_send_IPI_dest_field( |
74 | apic->cpu_to_logical_apicid(query_cpu), vector, | 76 | early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), |
75 | apic->dest_logical); | 77 | vector, apic->dest_logical); |
76 | local_irq_restore(flags); | 78 | local_irq_restore(flags); |
77 | } | 79 | } |
78 | 80 | ||
@@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, | |||
90 | if (query_cpu == this_cpu) | 92 | if (query_cpu == this_cpu) |
91 | continue; | 93 | continue; |
92 | __default_send_IPI_dest_field( | 94 | __default_send_IPI_dest_field( |
93 | apic->cpu_to_logical_apicid(query_cpu), vector, | 95 | early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), |
94 | apic->dest_logical); | 96 | vector, apic->dest_logical); |
95 | } | 97 | } |
96 | local_irq_restore(flags); | 98 | local_irq_restore(flags); |
97 | } | 99 | } |
98 | 100 | ||
99 | #ifdef CONFIG_X86_32 | ||
100 | |||
101 | /* | 101 | /* |
102 | * This is only used on smaller machines. | 102 | * This is only used on smaller machines. |
103 | */ | 103 | */ |
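The ipi.c hunk moves the two logical-destination helpers under CONFIG_X86_32 rather than merely converting them: the x86_cpu_to_logical_apicid map exists only on 32-bit, and the 64-bit APIC drivers touched below set the removed .cpu_to_logical_apicid callback to NULL anyway. The declaration this relies on (in a header, not shown in this section) presumably looks like:

        #ifdef CONFIG_X86_32
        /* Early-usable CPU -> logical APIC ID map; entries start out as
         * BAD_APICID until the owning CPU initializes them. */
        DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
        #endif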
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 960f26ab5c9f..6273eee5134b 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c | |||
@@ -373,13 +373,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask | |||
373 | return physids_promote(0xFUL, retmap); | 373 | return physids_promote(0xFUL, retmap); |
374 | } | 374 | } |
375 | 375 | ||
376 | static inline int numaq_cpu_to_logical_apicid(int cpu) | ||
377 | { | ||
378 | if (cpu >= nr_cpu_ids) | ||
379 | return BAD_APICID; | ||
380 | return cpu_2_logical_apicid[cpu]; | ||
381 | } | ||
382 | |||
383 | /* | 376 | /* |
384 | * Supporting over 60 cpus on NUMA-Q requires a locality-dependent | 377 | * Supporting over 60 cpus on NUMA-Q requires a locality-dependent |
385 | * cpu to APIC ID relation to properly interact with the intelligent | 378 | * cpu to APIC ID relation to properly interact with the intelligent |
@@ -398,6 +391,15 @@ static inline int numaq_apicid_to_node(int logical_apicid) | |||
398 | return logical_apicid >> 4; | 391 | return logical_apicid >> 4; |
399 | } | 392 | } |
400 | 393 | ||
394 | static int numaq_numa_cpu_node(int cpu) | ||
395 | { | ||
396 | int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); | ||
397 | |||
398 | if (logical_apicid != BAD_APICID) | ||
399 | return numaq_apicid_to_node(logical_apicid); | ||
400 | return NUMA_NO_NODE; | ||
401 | } | ||
402 | |||
401 | static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) | 403 | static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) |
402 | { | 404 | { |
403 | int node = numaq_apicid_to_node(logical_apicid); | 405 | int node = numaq_apicid_to_node(logical_apicid); |
@@ -508,8 +510,6 @@ struct apic __refdata apic_numaq = { | |||
508 | .ioapic_phys_id_map = numaq_ioapic_phys_id_map, | 510 | .ioapic_phys_id_map = numaq_ioapic_phys_id_map, |
509 | .setup_apic_routing = numaq_setup_apic_routing, | 511 | .setup_apic_routing = numaq_setup_apic_routing, |
510 | .multi_timer_check = numaq_multi_timer_check, | 512 | .multi_timer_check = numaq_multi_timer_check, |
511 | .apicid_to_node = numaq_apicid_to_node, | ||
512 | .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid, | ||
513 | .cpu_present_to_apicid = numaq_cpu_present_to_apicid, | 513 | .cpu_present_to_apicid = numaq_cpu_present_to_apicid, |
514 | .apicid_to_cpu_present = numaq_apicid_to_cpu_present, | 514 | .apicid_to_cpu_present = numaq_apicid_to_cpu_present, |
515 | .setup_portio_remap = numaq_setup_portio_remap, | 515 | .setup_portio_remap = numaq_setup_portio_remap, |
@@ -547,4 +547,7 @@ struct apic __refdata apic_numaq = { | |||
547 | .icr_write = native_apic_icr_write, | 547 | .icr_write = native_apic_icr_write, |
548 | .wait_icr_idle = native_apic_wait_icr_idle, | 548 | .wait_icr_idle = native_apic_wait_icr_idle, |
549 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, | 549 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, |
550 | |||
551 | .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, | ||
552 | .x86_32_numa_cpu_node = numaq_numa_cpu_node, | ||
550 | }; | 553 | }; |
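numaq encodes the node number in the upper nibble of the logical APIC ID (numaq_apicid_to_node() is just logical_apicid >> 4), so numaq_numa_cpu_node() can recover a CPU's node from the early per-CPU map alone. An illustrative call through the new hook — the fallback shown is hypothetical; the real call sites presumably live in the shared 32-bit NUMA code:

        int node = apic->x86_32_numa_cpu_node(cpu);     /* numaq: apicid >> 4 */
        if (node == NUMA_NO_NODE)
                node = first_online_node;               /* illustrative fallback */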
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 99d2fe016084..fc84c7b61108 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c | |||
@@ -77,6 +77,11 @@ void __init default_setup_apic_routing(void) | |||
77 | apic->setup_apic_routing(); | 77 | apic->setup_apic_routing(); |
78 | } | 78 | } |
79 | 79 | ||
80 | static int default_x86_32_early_logical_apicid(int cpu) | ||
81 | { | ||
82 | return 1 << cpu; | ||
83 | } | ||
84 | |||
80 | static void setup_apic_flat_routing(void) | 85 | static void setup_apic_flat_routing(void) |
81 | { | 86 | { |
82 | #ifdef CONFIG_X86_IO_APIC | 87 | #ifdef CONFIG_X86_IO_APIC |
@@ -130,8 +135,6 @@ struct apic apic_default = { | |||
130 | .ioapic_phys_id_map = default_ioapic_phys_id_map, | 135 | .ioapic_phys_id_map = default_ioapic_phys_id_map, |
131 | .setup_apic_routing = setup_apic_flat_routing, | 136 | .setup_apic_routing = setup_apic_flat_routing, |
132 | .multi_timer_check = NULL, | 137 | .multi_timer_check = NULL, |
133 | .apicid_to_node = default_apicid_to_node, | ||
134 | .cpu_to_logical_apicid = default_cpu_to_logical_apicid, | ||
135 | .cpu_present_to_apicid = default_cpu_present_to_apicid, | 138 | .cpu_present_to_apicid = default_cpu_present_to_apicid, |
136 | .apicid_to_cpu_present = physid_set_mask_of_physid, | 139 | .apicid_to_cpu_present = physid_set_mask_of_physid, |
137 | .setup_portio_remap = NULL, | 140 | .setup_portio_remap = NULL, |
@@ -167,6 +170,9 @@ struct apic apic_default = { | |||
167 | .icr_write = native_apic_icr_write, | 170 | .icr_write = native_apic_icr_write, |
168 | .wait_icr_idle = native_apic_wait_icr_idle, | 171 | .wait_icr_idle = native_apic_wait_icr_idle, |
169 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, | 172 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, |
173 | |||
174 | .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, | ||
175 | .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, | ||
170 | }; | 176 | }; |
171 | 177 | ||
172 | extern struct apic apic_numaq; | 178 | extern struct apic apic_numaq; |
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9b419263d90d..e4b8059b414a 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c | |||
@@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit) | |||
194 | return 1; | 194 | return 1; |
195 | } | 195 | } |
196 | 196 | ||
197 | static void summit_init_apic_ldr(void) | 197 | static int summit_early_logical_apicid(int cpu) |
198 | { | 198 | { |
199 | unsigned long val, id; | ||
200 | int count = 0; | 199 | int count = 0; |
201 | u8 my_id = (u8)hard_smp_processor_id(); | 200 | u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu); |
202 | u8 my_cluster = APIC_CLUSTER(my_id); | 201 | u8 my_cluster = APIC_CLUSTER(my_id); |
203 | #ifdef CONFIG_SMP | 202 | #ifdef CONFIG_SMP |
204 | u8 lid; | 203 | u8 lid; |
@@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void) | |||
206 | 205 | ||
207 | /* Create logical APIC IDs by counting CPUs already in cluster. */ | 206 | /* Create logical APIC IDs by counting CPUs already in cluster. */ |
208 | for (count = 0, i = nr_cpu_ids; --i >= 0; ) { | 207 | for (count = 0, i = nr_cpu_ids; --i >= 0; ) { |
209 | lid = cpu_2_logical_apicid[i]; | 208 | lid = early_per_cpu(x86_cpu_to_logical_apicid, i); |
210 | if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) | 209 | if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) |
211 | ++count; | 210 | ++count; |
212 | } | 211 | } |
@@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void) | |||
214 | /* We only have a 4 wide bitmap in cluster mode. If a deranged | 213 | /* We only have a 4 wide bitmap in cluster mode. If a deranged |
215 | * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ | 214 | * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ |
216 | BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); | 215 | BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); |
217 | id = my_cluster | (1UL << count); | 216 | return my_cluster | (1UL << count); |
217 | } | ||
218 | |||
219 | static void summit_init_apic_ldr(void) | ||
220 | { | ||
221 | int cpu = smp_processor_id(); | ||
222 | unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu); | ||
223 | unsigned long val; | ||
224 | |||
218 | apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); | 225 | apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); |
219 | val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; | 226 | val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; |
220 | val |= SET_APIC_LOGICAL_ID(id); | 227 | val |= SET_APIC_LOGICAL_ID(id); |
@@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void) | |||
232 | nr_ioapics); | 239 | nr_ioapics); |
233 | } | 240 | } |
234 | 241 | ||
235 | static int summit_apicid_to_node(int logical_apicid) | ||
236 | { | ||
237 | #ifdef CONFIG_SMP | ||
238 | return apicid_2_node[hard_smp_processor_id()]; | ||
239 | #else | ||
240 | return 0; | ||
241 | #endif | ||
242 | } | ||
243 | |||
244 | /* Mapping from cpu number to logical apicid */ | ||
245 | static inline int summit_cpu_to_logical_apicid(int cpu) | ||
246 | { | ||
247 | #ifdef CONFIG_SMP | ||
248 | if (cpu >= nr_cpu_ids) | ||
249 | return BAD_APICID; | ||
250 | return cpu_2_logical_apicid[cpu]; | ||
251 | #else | ||
252 | return logical_smp_processor_id(); | ||
253 | #endif | ||
254 | } | ||
255 | |||
256 | static int summit_cpu_present_to_apicid(int mps_cpu) | 242 | static int summit_cpu_present_to_apicid(int mps_cpu) |
257 | { | 243 | { |
258 | if (mps_cpu < nr_cpu_ids) | 244 | if (mps_cpu < nr_cpu_ids) |
@@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) | |||
286 | * The cpus in the mask must all be on the apic cluster. | 272 | * The cpus in the mask must all be on the apic cluster. |
287 | */ | 273 | */ |
288 | for_each_cpu(cpu, cpumask) { | 274 | for_each_cpu(cpu, cpumask) { |
289 | int new_apicid = summit_cpu_to_logical_apicid(cpu); | 275 | int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); |
290 | 276 | ||
291 | if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { | 277 | if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { |
292 | printk("%s: Not a valid mask!\n", __func__); | 278 | printk("%s: Not a valid mask!\n", __func__); |
@@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) | |||
301 | static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, | 287 | static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, |
302 | const struct cpumask *andmask) | 288 | const struct cpumask *andmask) |
303 | { | 289 | { |
304 | int apicid = summit_cpu_to_logical_apicid(0); | 290 | int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); |
305 | cpumask_var_t cpumask; | 291 | cpumask_var_t cpumask; |
306 | 292 | ||
307 | if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) | 293 | if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) |
@@ -528,8 +514,6 @@ struct apic apic_summit = { | |||
528 | .ioapic_phys_id_map = summit_ioapic_phys_id_map, | 514 | .ioapic_phys_id_map = summit_ioapic_phys_id_map, |
529 | .setup_apic_routing = summit_setup_apic_routing, | 515 | .setup_apic_routing = summit_setup_apic_routing, |
530 | .multi_timer_check = NULL, | 516 | .multi_timer_check = NULL, |
531 | .apicid_to_node = summit_apicid_to_node, | ||
532 | .cpu_to_logical_apicid = summit_cpu_to_logical_apicid, | ||
533 | .cpu_present_to_apicid = summit_cpu_present_to_apicid, | 517 | .cpu_present_to_apicid = summit_cpu_present_to_apicid, |
534 | .apicid_to_cpu_present = summit_apicid_to_cpu_present, | 518 | .apicid_to_cpu_present = summit_apicid_to_cpu_present, |
535 | .setup_portio_remap = NULL, | 519 | .setup_portio_remap = NULL, |
@@ -565,4 +549,7 @@ struct apic apic_summit = { | |||
565 | .icr_write = native_apic_icr_write, | 549 | .icr_write = native_apic_icr_write, |
566 | .wait_icr_idle = native_apic_wait_icr_idle, | 550 | .wait_icr_idle = native_apic_wait_icr_idle, |
567 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, | 551 | .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, |
552 | |||
553 | .x86_32_early_logical_apicid = summit_early_logical_apicid, | ||
554 | .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, | ||
568 | }; | 555 | }; |
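summit assigns a CPU's logical APIC ID by counting how many CPUs already occupy its 4-bit cluster and claiming the next free bit; the patch splits that computation out of summit_init_apic_ldr() into summit_early_logical_apicid() so it can run per-CPU at early boot. A worked example with illustrative values:

        /* Say my_id = 0x23, so my_cluster = APIC_CLUSTER(0x23) = 0x20, and
         * three CPUs in that cluster already hold bits 0-2:
         *
         *      count = 3  ->  id = 0x20 | (1UL << 3) = 0x28
         *
         * XAPIC_DEST_CPUS_SHIFT is 4, so the BUG_ON() fires if a BIOS
         * packs a fifth CPU into a single cluster. */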
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cf69c59f4910..90949bbd566d 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c | |||
@@ -206,8 +206,6 @@ struct apic apic_x2apic_cluster = { | |||
206 | .ioapic_phys_id_map = NULL, | 206 | .ioapic_phys_id_map = NULL, |
207 | .setup_apic_routing = NULL, | 207 | .setup_apic_routing = NULL, |
208 | .multi_timer_check = NULL, | 208 | .multi_timer_check = NULL, |
209 | .apicid_to_node = NULL, | ||
210 | .cpu_to_logical_apicid = NULL, | ||
211 | .cpu_present_to_apicid = default_cpu_present_to_apicid, | 209 | .cpu_present_to_apicid = default_cpu_present_to_apicid, |
212 | .apicid_to_cpu_present = NULL, | 210 | .apicid_to_cpu_present = NULL, |
213 | .setup_portio_remap = NULL, | 211 | .setup_portio_remap = NULL, |
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 8972f38c5ced..c7e6d6645bf4 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c | |||
@@ -195,8 +195,6 @@ struct apic apic_x2apic_phys = { | |||
195 | .ioapic_phys_id_map = NULL, | 195 | .ioapic_phys_id_map = NULL, |
196 | .setup_apic_routing = NULL, | 196 | .setup_apic_routing = NULL, |
197 | .multi_timer_check = NULL, | 197 | .multi_timer_check = NULL, |
198 | .apicid_to_node = NULL, | ||
199 | .cpu_to_logical_apicid = NULL, | ||
200 | .cpu_present_to_apicid = default_cpu_present_to_apicid, | 198 | .cpu_present_to_apicid = default_cpu_present_to_apicid, |
201 | .apicid_to_cpu_present = NULL, | 199 | .apicid_to_cpu_present = NULL, |
202 | .setup_portio_remap = NULL, | 200 | .setup_portio_remap = NULL, |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index bd16b58b8850..3c289281394c 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -338,8 +338,6 @@ struct apic __refdata apic_x2apic_uv_x = { | |||
338 | .ioapic_phys_id_map = NULL, | 338 | .ioapic_phys_id_map = NULL, |
339 | .setup_apic_routing = NULL, | 339 | .setup_apic_routing = NULL, |
340 | .multi_timer_check = NULL, | 340 | .multi_timer_check = NULL, |
341 | .apicid_to_node = NULL, | ||
342 | .cpu_to_logical_apicid = NULL, | ||
343 | .cpu_present_to_apicid = default_cpu_present_to_apicid, | 341 | .cpu_present_to_apicid = default_cpu_present_to_apicid, |
344 | .apicid_to_cpu_present = NULL, | 342 | .apicid_to_cpu_present = NULL, |
345 | .setup_portio_remap = NULL, | 343 | .setup_portio_remap = NULL, |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7c7bedb83c5a..f771ab6b49e9 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) | |||
233 | } | 233 | } |
234 | #endif | 234 | #endif |
235 | 235 | ||
236 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | 236 | #ifdef CONFIG_NUMA |
237 | /* | ||
238 | * To workaround broken NUMA config. Read the comment in | ||
239 | * srat_detect_node(). | ||
240 | */ | ||
237 | static int __cpuinit nearby_node(int apicid) | 241 | static int __cpuinit nearby_node(int apicid) |
238 | { | 242 | { |
239 | int i, node; | 243 | int i, node; |
240 | 244 | ||
241 | for (i = apicid - 1; i >= 0; i--) { | 245 | for (i = apicid - 1; i >= 0; i--) { |
242 | node = apicid_to_node[i]; | 246 | node = __apicid_to_node[i]; |
243 | if (node != NUMA_NO_NODE && node_online(node)) | 247 | if (node != NUMA_NO_NODE && node_online(node)) |
244 | return node; | 248 | return node; |
245 | } | 249 | } |
246 | for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { | 250 | for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { |
247 | node = apicid_to_node[i]; | 251 | node = __apicid_to_node[i]; |
248 | if (node != NUMA_NO_NODE && node_online(node)) | 252 | if (node != NUMA_NO_NODE && node_online(node)) |
249 | return node; | 253 | return node; |
250 | } | 254 | } |
@@ -261,7 +265,7 @@ static int __cpuinit nearby_node(int apicid) | |||
261 | #ifdef CONFIG_X86_HT | 265 | #ifdef CONFIG_X86_HT |
262 | static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) | 266 | static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) |
263 | { | 267 | { |
264 | u32 nodes; | 268 | u32 nodes, cores_per_cu = 1; |
265 | u8 node_id; | 269 | u8 node_id; |
266 | int cpu = smp_processor_id(); | 270 | int cpu = smp_processor_id(); |
267 | 271 | ||
@@ -276,6 +280,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) | |||
276 | /* get compute unit information */ | 280 | /* get compute unit information */ |
277 | smp_num_siblings = ((ebx >> 8) & 3) + 1; | 281 | smp_num_siblings = ((ebx >> 8) & 3) + 1; |
278 | c->compute_unit_id = ebx & 0xff; | 282 | c->compute_unit_id = ebx & 0xff; |
283 | cores_per_cu += ((ebx >> 8) & 3); | ||
279 | } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { | 284 | } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { |
280 | u64 value; | 285 | u64 value; |
281 | 286 | ||
@@ -288,15 +293,18 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) | |||
288 | /* fixup multi-node processor information */ | 293 | /* fixup multi-node processor information */ |
289 | if (nodes > 1) { | 294 | if (nodes > 1) { |
290 | u32 cores_per_node; | 295 | u32 cores_per_node; |
296 | u32 cus_per_node; | ||
291 | 297 | ||
292 | set_cpu_cap(c, X86_FEATURE_AMD_DCM); | 298 | set_cpu_cap(c, X86_FEATURE_AMD_DCM); |
293 | cores_per_node = c->x86_max_cores / nodes; | 299 | cores_per_node = c->x86_max_cores / nodes; |
300 | cus_per_node = cores_per_node / cores_per_cu; | ||
294 | 301 | ||
295 | /* store NodeID, use llc_shared_map to store sibling info */ | 302 | /* store NodeID, use llc_shared_map to store sibling info */ |
296 | per_cpu(cpu_llc_id, cpu) = node_id; | 303 | per_cpu(cpu_llc_id, cpu) = node_id; |
297 | 304 | ||
298 | /* core id to be in range from 0 to (cores_per_node - 1) */ | 305 | /* core id has to be in the [0 .. cores_per_node - 1] range */ |
299 | c->cpu_core_id = c->cpu_core_id % cores_per_node; | 306 | c->cpu_core_id %= cores_per_node; |
307 | c->compute_unit_id %= cus_per_node; | ||
300 | } | 308 | } |
301 | } | 309 | } |
302 | #endif | 310 | #endif |
@@ -334,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id); | |||
334 | 342 | ||
335 | static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) | 343 | static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) |
336 | { | 344 | { |
337 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | 345 | #ifdef CONFIG_NUMA |
338 | int cpu = smp_processor_id(); | 346 | int cpu = smp_processor_id(); |
339 | int node; | 347 | int node; |
340 | unsigned apicid = c->apicid; | 348 | unsigned apicid = c->apicid; |
341 | 349 | ||
342 | node = per_cpu(cpu_llc_id, cpu); | 350 | node = numa_cpu_node(cpu); |
351 | if (node == NUMA_NO_NODE) | ||
352 | node = per_cpu(cpu_llc_id, cpu); | ||
343 | 353 | ||
344 | if (apicid_to_node[apicid] != NUMA_NO_NODE) | ||
345 | node = apicid_to_node[apicid]; | ||
346 | if (!node_online(node)) { | 354 | if (!node_online(node)) { |
347 | /* Two possibilities here: | 355 | /* |
348 | - The CPU is missing memory and no node was created. | 356 | * Two possibilities here: |
349 | In that case try picking one from a nearby CPU | 357 | * |
350 | - The APIC IDs differ from the HyperTransport node IDs | 358 | * - The CPU is missing memory and no node was created. In |
351 | which the K8 northbridge parsing fills in. | 359 | * that case try picking one from a nearby CPU. |
352 | Assume they are all increased by a constant offset, | 360 | * |
353 | but in the same order as the HT nodeids. | 361 | * - The APIC IDs differ from the HyperTransport node IDs |
354 | If that doesn't result in a usable node fall back to the | 362 | * which the K8 northbridge parsing fills in. Assume |
355 | path for the previous case. */ | 363 | * they are all increased by a constant offset, but in |
356 | 364 | * the same order as the HT nodeids. If that doesn't | |
365 | * result in a usable node fall back to the path for the | ||
366 | * previous case. | ||
367 | * | ||
368 | * This workaround operates directly on the mapping between | ||
369 | * APIC ID and NUMA node, assuming certain relationship | ||
370 | * between APIC ID, HT node ID and NUMA topology. As going | ||
371 | * through CPU mapping may alter the outcome, directly | ||
372 | * access __apicid_to_node[]. | ||
373 | */ | ||
357 | int ht_nodeid = c->initial_apicid; | 374 | int ht_nodeid = c->initial_apicid; |
358 | 375 | ||
359 | if (ht_nodeid >= 0 && | 376 | if (ht_nodeid >= 0 && |
360 | apicid_to_node[ht_nodeid] != NUMA_NO_NODE) | 377 | __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) |
361 | node = apicid_to_node[ht_nodeid]; | 378 | node = __apicid_to_node[ht_nodeid]; |
362 | /* Pick a nearby node */ | 379 | /* Pick a nearby node */ |
363 | if (!node_online(node)) | 380 | if (!node_online(node)) |
364 | node = nearby_node(apicid); | 381 | node = nearby_node(apicid); |
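In amd.c, srat_detect_node() now consults numa_cpu_node(cpu) first and only falls back to the northbridge-derived cpu_llc_id, and the HT-offset workaround reads __apicid_to_node[] directly because, as the new comment explains, going through the regular CPU mapping could itself be skewed by the broken firmware table. The fallback chain, condensed (validity checks from the hunk omitted; the final numa_set_node() call falls outside the shown context):

        node = numa_cpu_node(cpu);                      /* SRAT/firmware mapping */
        if (node == NUMA_NO_NODE)
                node = per_cpu(cpu_llc_id, cpu);        /* HT node id */
        if (!node_online(node)) {
                node = __apicid_to_node[ht_nodeid];     /* constant-offset guess */
                if (!node_online(node))
                        node = nearby_node(apicid);     /* nearest populated node */
        }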
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5d98c46f876d..e2ced0074a45 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -869,7 +869,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) | |||
869 | 869 | ||
870 | select_idle_routine(c); | 870 | select_idle_routine(c); |
871 | 871 | ||
872 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | 872 | #ifdef CONFIG_NUMA |
873 | numa_add_cpu(smp_processor_id()); | 873 | numa_add_cpu(smp_processor_id()); |
874 | #endif | 874 | #endif |
875 | } | 875 | } |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d16c2c53d6bf..df86bc8c859d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -276,14 +276,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) | |||
276 | 276 | ||
277 | static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) | 277 | static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) |
278 | { | 278 | { |
279 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) | 279 | #ifdef CONFIG_NUMA |
280 | unsigned node; | 280 | unsigned node; |
281 | int cpu = smp_processor_id(); | 281 | int cpu = smp_processor_id(); |
282 | int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; | ||
283 | 282 | ||
284 | /* Don't do the funky fallback heuristics the AMD version employs | 283 | /* Don't do the funky fallback heuristics the AMD version employs |
285 | for now. */ | 284 | for now. */ |
286 | node = apicid_to_node[apicid]; | 285 | node = numa_cpu_node(cpu); |
287 | if (node == NUMA_NO_NODE || !node_online(node)) { | 286 | if (node == NUMA_NO_NODE || !node_online(node)) { |
288 | /* reuse the value from init_cpu_to_node() */ | 287 | /* reuse the value from init_cpu_to_node() */ |
289 | node = cpu_to_node(cpu); | 288 | node = cpu_to_node(cpu); |
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 5419a263ebd1..1ce1af2899df 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c | |||
@@ -304,8 +304,9 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, | |||
304 | 304 | ||
305 | struct _cache_attr { | 305 | struct _cache_attr { |
306 | struct attribute attr; | 306 | struct attribute attr; |
307 | ssize_t (*show)(struct _cpuid4_info *, char *); | 307 | ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int); |
308 | ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); | 308 | ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count, |
309 | unsigned int); | ||
309 | }; | 310 | }; |
310 | 311 | ||
311 | #ifdef CONFIG_AMD_NB | 312 | #ifdef CONFIG_AMD_NB |
@@ -400,7 +401,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, | |||
400 | 401 | ||
401 | #define SHOW_CACHE_DISABLE(slot) \ | 402 | #define SHOW_CACHE_DISABLE(slot) \ |
402 | static ssize_t \ | 403 | static ssize_t \ |
403 | show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ | 404 | show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ |
405 | unsigned int cpu) \ | ||
404 | { \ | 406 | { \ |
405 | return show_cache_disable(this_leaf, buf, slot); \ | 407 | return show_cache_disable(this_leaf, buf, slot); \ |
406 | } | 408 | } |
@@ -512,7 +514,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, | |||
512 | #define STORE_CACHE_DISABLE(slot) \ | 514 | #define STORE_CACHE_DISABLE(slot) \ |
513 | static ssize_t \ | 515 | static ssize_t \ |
514 | store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ | 516 | store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ |
515 | const char *buf, size_t count) \ | 517 | const char *buf, size_t count, \ |
518 | unsigned int cpu) \ | ||
516 | { \ | 519 | { \ |
517 | return store_cache_disable(this_leaf, buf, count, slot); \ | 520 | return store_cache_disable(this_leaf, buf, count, slot); \ |
518 | } | 521 | } |
@@ -524,6 +527,39 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, | |||
524 | static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, | 527 | static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, |
525 | show_cache_disable_1, store_cache_disable_1); | 528 | show_cache_disable_1, store_cache_disable_1); |
526 | 529 | ||
530 | static ssize_t | ||
531 | show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) | ||
532 | { | ||
533 | if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | ||
534 | return -EINVAL; | ||
535 | |||
536 | return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); | ||
537 | } | ||
538 | |||
539 | static ssize_t | ||
540 | store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, | ||
541 | unsigned int cpu) | ||
542 | { | ||
543 | unsigned long val; | ||
544 | |||
545 | if (!capable(CAP_SYS_ADMIN)) | ||
546 | return -EPERM; | ||
547 | |||
548 | if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | ||
549 | return -EINVAL; | ||
550 | |||
551 | if (strict_strtoul(buf, 16, &val) < 0) | ||
552 | return -EINVAL; | ||
553 | |||
554 | if (amd_set_subcaches(cpu, val)) | ||
555 | return -EINVAL; | ||
556 | |||
557 | return count; | ||
558 | } | ||
559 | |||
560 | static struct _cache_attr subcaches = | ||
561 | __ATTR(subcaches, 0644, show_subcaches, store_subcaches); | ||
562 | |||
527 | #else /* CONFIG_AMD_NB */ | 563 | #else /* CONFIG_AMD_NB */ |
528 | #define amd_init_l3_cache(x, y) | 564 | #define amd_init_l3_cache(x, y) |
529 | #endif /* CONFIG_AMD_NB */ | 565 | #endif /* CONFIG_AMD_NB */ |
@@ -532,9 +568,9 @@ static int | |||
532 | __cpuinit cpuid4_cache_lookup_regs(int index, | 568 | __cpuinit cpuid4_cache_lookup_regs(int index, |
533 | struct _cpuid4_info_regs *this_leaf) | 569 | struct _cpuid4_info_regs *this_leaf) |
534 | { | 570 | { |
535 | union _cpuid4_leaf_eax eax; | 571 | union _cpuid4_leaf_eax eax; |
536 | union _cpuid4_leaf_ebx ebx; | 572 | union _cpuid4_leaf_ebx ebx; |
537 | union _cpuid4_leaf_ecx ecx; | 573 | union _cpuid4_leaf_ecx ecx; |
538 | unsigned edx; | 574 | unsigned edx; |
539 | 575 | ||
540 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { | 576 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { |
@@ -870,8 +906,8 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); | |||
870 | #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) | 906 | #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) |
871 | 907 | ||
872 | #define show_one_plus(file_name, object, val) \ | 908 | #define show_one_plus(file_name, object, val) \ |
873 | static ssize_t show_##file_name \ | 909 | static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ |
874 | (struct _cpuid4_info *this_leaf, char *buf) \ | 910 | unsigned int cpu) \ |
875 | { \ | 911 | { \ |
876 | return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ | 912 | return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ |
877 | } | 913 | } |
@@ -882,7 +918,8 @@ show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1); | |||
882 | show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); | 918 | show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); |
883 | show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); | 919 | show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); |
884 | 920 | ||
885 | static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) | 921 | static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, |
922 | unsigned int cpu) | ||
886 | { | 923 | { |
887 | return sprintf(buf, "%luK\n", this_leaf->size / 1024); | 924 | return sprintf(buf, "%luK\n", this_leaf->size / 1024); |
888 | } | 925 | } |
@@ -906,17 +943,20 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, | |||
906 | return n; | 943 | return n; |
907 | } | 944 | } |
908 | 945 | ||
909 | static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) | 946 | static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, |
947 | unsigned int cpu) | ||
910 | { | 948 | { |
911 | return show_shared_cpu_map_func(leaf, 0, buf); | 949 | return show_shared_cpu_map_func(leaf, 0, buf); |
912 | } | 950 | } |
913 | 951 | ||
914 | static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) | 952 | static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, |
953 | unsigned int cpu) | ||
915 | { | 954 | { |
916 | return show_shared_cpu_map_func(leaf, 1, buf); | 955 | return show_shared_cpu_map_func(leaf, 1, buf); |
917 | } | 956 | } |
918 | 957 | ||
919 | static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) | 958 | static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, |
959 | unsigned int cpu) | ||
920 | { | 960 | { |
921 | switch (this_leaf->eax.split.type) { | 961 | switch (this_leaf->eax.split.type) { |
922 | case CACHE_TYPE_DATA: | 962 | case CACHE_TYPE_DATA: |
@@ -974,6 +1014,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void) | |||
974 | if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) | 1014 | if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) |
975 | n += 2; | 1015 | n += 2; |
976 | 1016 | ||
1017 | if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | ||
1018 | n += 1; | ||
1019 | |||
977 | attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); | 1020 | attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); |
978 | if (attrs == NULL) | 1021 | if (attrs == NULL) |
979 | return attrs = default_attrs; | 1022 | return attrs = default_attrs; |
@@ -986,6 +1029,9 @@ static struct attribute ** __cpuinit amd_l3_attrs(void) | |||
986 | attrs[n++] = &cache_disable_1.attr; | 1029 | attrs[n++] = &cache_disable_1.attr; |
987 | } | 1030 | } |
988 | 1031 | ||
1032 | if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) | ||
1033 | attrs[n++] = &subcaches.attr; | ||
1034 | |||
989 | return attrs; | 1035 | return attrs; |
990 | } | 1036 | } |
991 | #endif | 1037 | #endif |
@@ -998,7 +1044,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) | |||
998 | 1044 | ||
999 | ret = fattr->show ? | 1045 | ret = fattr->show ? |
1000 | fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), | 1046 | fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), |
1001 | buf) : | 1047 | buf, this_leaf->cpu) : |
1002 | 0; | 1048 | 0; |
1003 | return ret; | 1049 | return ret; |
1004 | } | 1050 | } |
@@ -1012,7 +1058,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, | |||
1012 | 1058 | ||
1013 | ret = fattr->store ? | 1059 | ret = fattr->store ? |
1014 | fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), | 1060 | fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), |
1015 | buf, count) : | 1061 | buf, count, this_leaf->cpu) : |
1016 | 0; | 1062 | 0; |
1017 | return ret; | 1063 | return ret; |
1018 | } | 1064 | } |
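Every _cache_attr method in intel_cacheinfo.c gains a trailing `unsigned int cpu` parameter so attributes like the new subcaches file can act on the CPU owning the sysfs object; the generic show()/store() wrappers pass this_leaf->cpu through. A sketch of a read-only attribute under the new signature (the name "example" is hypothetical; subcaches above is the real addition):

        static ssize_t show_example(struct _cpuid4_info *this_leaf, char *buf,
                                    unsigned int cpu)
        {
                /* 'cpu' identifies the CPU this cache leaf belongs to */
                return sprintf(buf, "cpu%u\n", cpu);
        }

        static struct _cache_attr example =
                __ATTR(example, 0444, show_example, NULL);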
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 294f26da0c0c..0b5e2b546566 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -847,15 +847,21 @@ static int __init parse_memopt(char *p) | |||
847 | if (!p) | 847 | if (!p) |
848 | return -EINVAL; | 848 | return -EINVAL; |
849 | 849 | ||
850 | #ifdef CONFIG_X86_32 | ||
851 | if (!strcmp(p, "nopentium")) { | 850 | if (!strcmp(p, "nopentium")) { |
851 | #ifdef CONFIG_X86_32 | ||
852 | setup_clear_cpu_cap(X86_FEATURE_PSE); | 852 | setup_clear_cpu_cap(X86_FEATURE_PSE); |
853 | return 0; | 853 | return 0; |
854 | } | 854 | #else |
855 | printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); | ||
856 | return -EINVAL; | ||
855 | #endif | 857 | #endif |
858 | } | ||
856 | 859 | ||
857 | userdef = 1; | 860 | userdef = 1; |
858 | mem_size = memparse(p, &p); | 861 | mem_size = memparse(p, &p); |
862 | /* don't remove all of memory when handling "mem={invalid}" param */ | ||
863 | if (mem_size == 0) | ||
864 | return -EINVAL; | ||
859 | e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); | 865 | e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); |
860 | 866 | ||
861 | return 0; | 867 | return 0; |
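Two behavioral fixes land in parse_memopt(): "mem=nopentium" now warns and returns -EINVAL on 64-bit instead of falling through and being parsed as a size, and a mem= value that memparse() cannot read no longer wipes the whole E820 map. memparse() returns 0 when the string has no leading digits, so the zero check catches both "mem=0" and plain garbage:

        mem=512M  ->  mem_size = 0x20000000; RAM above 512 MiB is trimmed
        mem=junk  ->  memparse() yields 0   ->  -EINVAL; the map is untouched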
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c32cbbcff7b9..b72b4a6466a9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -977,9 +977,12 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ | |||
977 | x86_platform_ipi smp_x86_platform_ipi | 977 | x86_platform_ipi smp_x86_platform_ipi |
978 | 978 | ||
979 | #ifdef CONFIG_SMP | 979 | #ifdef CONFIG_SMP |
980 | .irpc idx, "01234567" | 980 | .irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ |
981 | 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 | ||
982 | .if NUM_INVALIDATE_TLB_VECTORS > \idx | ||
981 | apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ | 983 | apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ |
982 | invalidate_interrupt\idx smp_invalidate_interrupt | 984 | invalidate_interrupt\idx smp_invalidate_interrupt |
985 | .endif | ||
983 | .endr | 986 | .endr |
984 | #endif | 987 | #endif |
985 | 988 | ||
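In entry_64.S, .irpc iterates over the characters of a string, which capped the old loop at the eight single-digit stubs "0"-"7"; .irp iterates over a comma-separated list, and the inner .if suppresses entries at or above NUM_INVALIDATE_TLB_VECTORS so unused stubs are never assembled. For \idx = 0 the body expands to the same line the old loop produced:

        apicinterrupt (INVALIDATE_TLB_VECTOR_START)+0 invalidate_interrupt0 smp_invalidate_interrupt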
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1cc302d16fb4..d30854b18d25 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -166,14 +166,77 @@ static void __init smp_intr_init(void) | |||
166 | alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); | 166 | alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); |
167 | 167 | ||
168 | /* IPIs for invalidation */ | 168 | /* IPIs for invalidation */ |
169 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); | 169 | #define ALLOC_INVTLB_VEC(NR) \ |
170 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); | 170 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \ |
171 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); | 171 | invalidate_interrupt##NR) |
172 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); | 172 | |
173 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); | 173 | switch (NUM_INVALIDATE_TLB_VECTORS) { |
174 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); | 174 | default: |
175 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); | 175 | ALLOC_INVTLB_VEC(31); |
176 | alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); | 176 | case 31: |
177 | ALLOC_INVTLB_VEC(30); | ||
178 | case 30: | ||
179 | ALLOC_INVTLB_VEC(29); | ||
180 | case 29: | ||
181 | ALLOC_INVTLB_VEC(28); | ||
182 | case 28: | ||
183 | ALLOC_INVTLB_VEC(27); | ||
184 | case 27: | ||
185 | ALLOC_INVTLB_VEC(26); | ||
186 | case 26: | ||
187 | ALLOC_INVTLB_VEC(25); | ||
188 | case 25: | ||
189 | ALLOC_INVTLB_VEC(24); | ||
190 | case 24: | ||
191 | ALLOC_INVTLB_VEC(23); | ||
192 | case 23: | ||
193 | ALLOC_INVTLB_VEC(22); | ||
194 | case 22: | ||
195 | ALLOC_INVTLB_VEC(21); | ||
196 | case 21: | ||
197 | ALLOC_INVTLB_VEC(20); | ||
198 | case 20: | ||
199 | ALLOC_INVTLB_VEC(19); | ||
200 | case 19: | ||
201 | ALLOC_INVTLB_VEC(18); | ||
202 | case 18: | ||
203 | ALLOC_INVTLB_VEC(17); | ||
204 | case 17: | ||
205 | ALLOC_INVTLB_VEC(16); | ||
206 | case 16: | ||
207 | ALLOC_INVTLB_VEC(15); | ||
208 | case 15: | ||
209 | ALLOC_INVTLB_VEC(14); | ||
210 | case 14: | ||
211 | ALLOC_INVTLB_VEC(13); | ||
212 | case 13: | ||
213 | ALLOC_INVTLB_VEC(12); | ||
214 | case 12: | ||
215 | ALLOC_INVTLB_VEC(11); | ||
216 | case 11: | ||
217 | ALLOC_INVTLB_VEC(10); | ||
218 | case 10: | ||
219 | ALLOC_INVTLB_VEC(9); | ||
220 | case 9: | ||
221 | ALLOC_INVTLB_VEC(8); | ||
222 | case 8: | ||
223 | ALLOC_INVTLB_VEC(7); | ||
224 | case 7: | ||
225 | ALLOC_INVTLB_VEC(6); | ||
226 | case 6: | ||
227 | ALLOC_INVTLB_VEC(5); | ||
228 | case 5: | ||
229 | ALLOC_INVTLB_VEC(4); | ||
230 | case 4: | ||
231 | ALLOC_INVTLB_VEC(3); | ||
232 | case 3: | ||
233 | ALLOC_INVTLB_VEC(2); | ||
234 | case 2: | ||
235 | ALLOC_INVTLB_VEC(1); | ||
236 | case 1: | ||
237 | ALLOC_INVTLB_VEC(0); | ||
238 | break; | ||
239 | } | ||
177 | 240 | ||
178 | /* IPI for generic function call */ | 241 | /* IPI for generic function call */ |
179 | alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); | 242 | alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); |
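The irqinit.c switch is written without break statements on purpose: it dispatches on NUM_INVALIDATE_TLB_VECTORS, the default: arm covers 32, and each case N: allocates vector N-1 before falling through to the next arm, so exactly vectors 0 .. NUM_INVALIDATE_TLB_VECTORS-1 receive gates. A plain for loop cannot replace it because the handler symbol is formed by preprocessor token pasting, which needs a literal number at compile time:

        /* invalidate_interrupt##NR must paste to invalidate_interrupt5,
         * invalidate_interrupt6, ... — a runtime loop index cannot
         * produce a symbol name. */
        #define ALLOC_INVTLB_VEC(NR) \
                alloc_intr_gate(INVALIDATE_TLB_VECTOR_START + NR, \
                                invalidate_interrupt##NR)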
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3cfe26c0252..c3a606c41ce0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -293,10 +293,32 @@ static void __init init_gbpages(void) | |||
293 | else | 293 | else |
294 | direct_gbpages = 0; | 294 | direct_gbpages = 0; |
295 | } | 295 | } |
296 | |||
297 | static void __init cleanup_highmap_brk_end(void) | ||
298 | { | ||
299 | pud_t *pud; | ||
300 | pmd_t *pmd; | ||
301 | |||
302 | mmu_cr4_features = read_cr4(); | ||
303 | |||
304 | /* | ||
305 | * _brk_end cannot change anymore, but it and _end may be | ||
306 | * located on different 2M pages. cleanup_highmap(), however, | ||
307 | * can only consider _end when it runs, so destroy any | ||
308 | * mappings beyond _brk_end here. | ||
309 | */ | ||
310 | pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); | ||
311 | pmd = pmd_offset(pud, _brk_end - 1); | ||
312 | while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) | ||
313 | pmd_clear(pmd); | ||
314 | } | ||
296 | #else | 315 | #else |
297 | static inline void init_gbpages(void) | 316 | static inline void init_gbpages(void) |
298 | { | 317 | { |
299 | } | 318 | } |
319 | static inline void cleanup_highmap_brk_end(void) | ||
320 | { | ||
321 | } | ||
300 | #endif | 322 | #endif |
301 | 323 | ||
302 | static void __init reserve_brk(void) | 324 | static void __init reserve_brk(void) |
@@ -307,6 +329,8 @@ static void __init reserve_brk(void) | |||
307 | /* Mark brk area as locked down and no longer taking any | 329 | /* Mark brk area as locked down and no longer taking any |
308 | new allocations */ | 330 | new allocations */ |
309 | _brk_start = 0; | 331 | _brk_start = 0; |
332 | |||
333 | cleanup_highmap_brk_end(); | ||
310 | } | 334 | } |
311 | 335 | ||
312 | #ifdef CONFIG_BLK_DEV_INITRD | 336 | #ifdef CONFIG_BLK_DEV_INITRD |
@@ -680,15 +704,6 @@ static int __init parse_reservelow(char *p) | |||
680 | 704 | ||
681 | early_param("reservelow", parse_reservelow); | 705 | early_param("reservelow", parse_reservelow); |
682 | 706 | ||
683 | static u64 __init get_max_mapped(void) | ||
684 | { | ||
685 | u64 end = max_pfn_mapped; | ||
686 | |||
687 | end <<= PAGE_SHIFT; | ||
688 | |||
689 | return end; | ||
690 | } | ||
691 | |||
692 | /* | 707 | /* |
693 | * Determine if we were loaded by an EFI loader. If so, then we have also been | 708 | * Determine if we were loaded by an EFI loader. If so, then we have also been |
694 | * passed the efi memmap, systab, etc., so we should use these data structures | 709 | * passed the efi memmap, systab, etc., so we should use these data structures |
@@ -704,8 +719,6 @@ static u64 __init get_max_mapped(void) | |||
704 | 719 | ||
705 | void __init setup_arch(char **cmdline_p) | 720 | void __init setup_arch(char **cmdline_p) |
706 | { | 721 | { |
707 | int acpi = 0; | ||
708 | int amd = 0; | ||
709 | unsigned long flags; | 722 | unsigned long flags; |
710 | 723 | ||
711 | #ifdef CONFIG_X86_32 | 724 | #ifdef CONFIG_X86_32 |
@@ -984,19 +997,7 @@ void __init setup_arch(char **cmdline_p) | |||
984 | 997 | ||
985 | early_acpi_boot_init(); | 998 | early_acpi_boot_init(); |
986 | 999 | ||
987 | #ifdef CONFIG_ACPI_NUMA | 1000 | initmem_init(); |
988 | /* | ||
989 | * Parse SRAT to discover nodes. | ||
990 | */ | ||
991 | acpi = acpi_numa_init(); | ||
992 | #endif | ||
993 | |||
994 | #ifdef CONFIG_AMD_NUMA | ||
995 | if (!acpi) | ||
996 | amd = !amd_numa_init(0, max_pfn); | ||
997 | #endif | ||
998 | |||
999 | initmem_init(0, max_pfn, acpi, amd); | ||
1000 | memblock_find_dma_reserve(); | 1001 | memblock_find_dma_reserve(); |
1001 | dma32_reserve_bootmem(); | 1002 | dma32_reserve_bootmem(); |
1002 | 1003 | ||
@@ -1040,9 +1041,7 @@ void __init setup_arch(char **cmdline_p) | |||
1040 | 1041 | ||
1041 | prefill_possible_map(); | 1042 | prefill_possible_map(); |
1042 | 1043 | ||
1043 | #ifdef CONFIG_X86_64 | ||
1044 | init_cpu_to_node(); | 1044 | init_cpu_to_node(); |
1045 | #endif | ||
1046 | 1045 | ||
1047 | init_apic_mappings(); | 1046 | init_apic_mappings(); |
1048 | ioapic_and_gsi_init(); | 1047 | ioapic_and_gsi_init(); |
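setup_arch() no longer probes SRAT or the AMD northbridge itself; initmem_init() takes no arguments and is expected to drive NUMA detection, which matches the header changes making x86_acpi_numa_init() and amd_numa_init() parameterless. A sketch of the assumed detection order behind initmem_init() — the actual body lives in arch/x86/mm and is not part of this section:

        static void __init numa_detect_sketch(void)     /* hypothetical */
        {
        #ifdef CONFIG_ACPI_NUMA
                if (!x86_acpi_numa_init())
                        return;         /* SRAT parsed successfully */
        #endif
        #ifdef CONFIG_AMD_NUMA
                if (!amd_numa_init())
                        return;         /* northbridge scan succeeded */
        #endif
                /* otherwise fall back to one dummy node covering all RAM */
        }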
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 002b79685f73..71f4727da373 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -225,10 +225,15 @@ void __init setup_per_cpu_areas(void) | |||
225 | per_cpu(x86_bios_cpu_apicid, cpu) = | 225 | per_cpu(x86_bios_cpu_apicid, cpu) = |
226 | early_per_cpu_map(x86_bios_cpu_apicid, cpu); | 226 | early_per_cpu_map(x86_bios_cpu_apicid, cpu); |
227 | #endif | 227 | #endif |
228 | #ifdef CONFIG_X86_32 | ||
229 | per_cpu(x86_cpu_to_logical_apicid, cpu) = | ||
230 | early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); | ||
231 | #endif | ||
228 | #ifdef CONFIG_X86_64 | 232 | #ifdef CONFIG_X86_64 |
229 | per_cpu(irq_stack_ptr, cpu) = | 233 | per_cpu(irq_stack_ptr, cpu) = |
230 | per_cpu(irq_stack_union.irq_stack, cpu) + | 234 | per_cpu(irq_stack_union.irq_stack, cpu) + |
231 | IRQ_STACK_SIZE - 64; | 235 | IRQ_STACK_SIZE - 64; |
236 | #endif | ||
232 | #ifdef CONFIG_NUMA | 237 | #ifdef CONFIG_NUMA |
233 | per_cpu(x86_cpu_to_node_map, cpu) = | 238 | per_cpu(x86_cpu_to_node_map, cpu) = |
234 | early_per_cpu_map(x86_cpu_to_node_map, cpu); | 239 | early_per_cpu_map(x86_cpu_to_node_map, cpu); |
@@ -242,7 +247,6 @@ void __init setup_per_cpu_areas(void) | |||
242 | */ | 247 | */ |
243 | set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); | 248 | set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); |
244 | #endif | 249 | #endif |
245 | #endif | ||
246 | /* | 250 | /* |
247 | * Up to this point, the boot CPU has been using .init.data | 251 | * Up to this point, the boot CPU has been using .init.data |
248 | * area. Reload any changed state for the boot CPU. | 252 | * area. Reload any changed state for the boot CPU. |
@@ -256,7 +260,10 @@ void __init setup_per_cpu_areas(void) | |||
256 | early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; | 260 | early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; |
257 | early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; | 261 | early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; |
258 | #endif | 262 | #endif |
259 | #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) | 263 | #ifdef CONFIG_X86_32 |
264 | early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; | ||
265 | #endif | ||
266 | #ifdef CONFIG_NUMA | ||
260 | early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; | 267 | early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; |
261 | #endif | 268 | #endif |
262 | 269 | ||
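The setup_percpu.c changes extend an existing two-phase pattern to x86_cpu_to_logical_apicid: each early map is copied into the real per-CPU variables and its early pointer is then cleared, which flips every later early_per_cpu() access from the static boot-time array to the per-CPU area. The accessor (paraphrased from asm/percpu.h of the era) makes the switch-over explicit:

        #define early_per_cpu(_name, _cpu)                      \
                *(early_per_cpu_ptr(_name) ?                    \
                        &early_per_cpu_ptr(_name)[_cpu] :       \
                        &per_cpu(_name, _cpu))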
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 16ce42613991..e9efdfd51c8d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -72,10 +72,6 @@ | |||
72 | #include <asm/smpboot_hooks.h> | 72 | #include <asm/smpboot_hooks.h> |
73 | #include <asm/i8259.h> | 73 | #include <asm/i8259.h> |
74 | 74 | ||
75 | #ifdef CONFIG_X86_32 | ||
76 | u8 apicid_2_node[MAX_APICID]; | ||
77 | #endif | ||
78 | |||
79 | /* State of each CPU */ | 75 | /* State of each CPU */ |
80 | DEFINE_PER_CPU(int, cpu_state) = { 0 }; | 76 | DEFINE_PER_CPU(int, cpu_state) = { 0 }; |
81 | 77 | ||
@@ -139,62 +135,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); | |||
139 | 135 | ||
140 | atomic_t init_deasserted; | 136 | atomic_t init_deasserted; |
141 | 137 | ||
142 | #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) | ||
143 | /* which node each logical CPU is on */ | ||
144 | int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; | ||
145 | EXPORT_SYMBOL(cpu_to_node_map); | ||
146 | |||
147 | /* set up a mapping between cpu and node. */ | ||
148 | static void map_cpu_to_node(int cpu, int node) | ||
149 | { | ||
150 | printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); | ||
151 | cpumask_set_cpu(cpu, node_to_cpumask_map[node]); | ||
152 | cpu_to_node_map[cpu] = node; | ||
153 | } | ||
154 | |||
155 | /* undo a mapping between cpu and node. */ | ||
156 | static void unmap_cpu_to_node(int cpu) | ||
157 | { | ||
158 | int node; | ||
159 | |||
160 | printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); | ||
161 | for (node = 0; node < MAX_NUMNODES; node++) | ||
162 | cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); | ||
163 | cpu_to_node_map[cpu] = 0; | ||
164 | } | ||
165 | #else /* !(CONFIG_NUMA && CONFIG_X86_32) */ | ||
166 | #define map_cpu_to_node(cpu, node) ({}) | ||
167 | #define unmap_cpu_to_node(cpu) ({}) | ||
168 | #endif | ||
169 | |||
170 | #ifdef CONFIG_X86_32 | ||
171 | static int boot_cpu_logical_apicid; | ||
172 | |||
173 | u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = | ||
174 | { [0 ... NR_CPUS-1] = BAD_APICID }; | ||
175 | |||
176 | static void map_cpu_to_logical_apicid(void) | ||
177 | { | ||
178 | int cpu = smp_processor_id(); | ||
179 | int apicid = logical_smp_processor_id(); | ||
180 | int node = apic->apicid_to_node(apicid); | ||
181 | |||
182 | if (!node_online(node)) | ||
183 | node = first_online_node; | ||
184 | |||
185 | cpu_2_logical_apicid[cpu] = apicid; | ||
186 | map_cpu_to_node(cpu, node); | ||
187 | } | ||
188 | |||
189 | void numa_remove_cpu(int cpu) | ||
190 | { | ||
191 | cpu_2_logical_apicid[cpu] = BAD_APICID; | ||
192 | unmap_cpu_to_node(cpu); | ||
193 | } | ||
194 | #else | ||
195 | #define map_cpu_to_logical_apicid() do {} while (0) | ||
196 | #endif | ||
197 | |||
198 | /* | 138 | /* |
199 | * Report back to the Boot Processor. | 139 | * Report back to the Boot Processor. |
200 | * Running on AP. | 140 | * Running on AP. |
@@ -262,7 +202,6 @@ static void __cpuinit smp_callin(void) | |||
262 | apic->smp_callin_clear_local_apic(); | 202 | apic->smp_callin_clear_local_apic(); |
263 | setup_local_APIC(); | 203 | setup_local_APIC(); |
264 | end_local_APIC_setup(); | 204 | end_local_APIC_setup(); |
265 | map_cpu_to_logical_apicid(); | ||
266 | 205 | ||
267 | /* | 206 | /* |
268 | * Need to setup vector mappings before we enable interrupts. | 207 | * Need to setup vector mappings before we enable interrupts. |
@@ -397,6 +336,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) | |||
397 | 336 | ||
398 | if (cpu_has(c, X86_FEATURE_TOPOEXT)) { | 337 | if (cpu_has(c, X86_FEATURE_TOPOEXT)) { |
399 | if (c->phys_proc_id == o->phys_proc_id && | 338 | if (c->phys_proc_id == o->phys_proc_id && |
339 | per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && | ||
400 | c->compute_unit_id == o->compute_unit_id) | 340 | c->compute_unit_id == o->compute_unit_id) |
401 | link_thread_siblings(cpu, i); | 341 | link_thread_siblings(cpu, i); |
402 | } else if (c->phys_proc_id == o->phys_proc_id && | 342 | } else if (c->phys_proc_id == o->phys_proc_id && |
@@ -951,7 +891,6 @@ static __init void disable_smp(void) | |||
951 | physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); | 891 | physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); |
952 | else | 892 | else |
953 | physid_set_mask_of_physid(0, &phys_cpu_present_map); | 893 | physid_set_mask_of_physid(0, &phys_cpu_present_map); |
954 | map_cpu_to_logical_apicid(); | ||
955 | cpumask_set_cpu(0, cpu_sibling_mask(0)); | 894 | cpumask_set_cpu(0, cpu_sibling_mask(0)); |
956 | cpumask_set_cpu(0, cpu_core_mask(0)); | 895 | cpumask_set_cpu(0, cpu_core_mask(0)); |
957 | } | 896 | } |
@@ -1087,9 +1026,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) | |||
1087 | smp_store_cpu_info(0); /* Final full version of the data */ | 1026 | smp_store_cpu_info(0); /* Final full version of the data */ |
1088 | cpumask_copy(cpu_callin_mask, cpumask_of(0)); | 1027 | cpumask_copy(cpu_callin_mask, cpumask_of(0)); |
1089 | mb(); | 1028 | mb(); |
1090 | #ifdef CONFIG_X86_32 | 1029 | |
1091 | boot_cpu_logical_apicid = logical_smp_processor_id(); | ||
1092 | #endif | ||
1093 | current_thread_info()->cpu = 0; /* needed? */ | 1030 | current_thread_info()->cpu = 0; /* needed? */ |
1094 | for_each_possible_cpu(i) { | 1031 | for_each_possible_cpu(i) { |
1095 | zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); | 1032 | zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); |
@@ -1130,8 +1067,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) | |||
1130 | 1067 | ||
1131 | bsp_end_local_APIC_setup(); | 1068 | bsp_end_local_APIC_setup(); |
1132 | 1069 | ||
1133 | map_cpu_to_logical_apicid(); | ||
1134 | |||
1135 | if (apic->setup_portio_remap) | 1070 | if (apic->setup_portio_remap) |
1136 | apic->setup_portio_remap(); | 1071 | apic->setup_portio_remap(); |
1137 | 1072 | ||
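The smpboot.c hunks delete all of the 32-bit-private bookkeeping — cpu_2_logical_apicid[], cpu_to_node_map[], map_cpu_to_node()/unmap_cpu_to_node() and map_cpu_to_logical_apicid() — now that logical APIC IDs come from the early per-CPU map and the CPU-to-node wiring goes through the common NUMA code. The replacement flow at bring-up presumably reduces to the shared helpers (call sites are outside this section):

        numa_set_node(cpu, node);       /* records x86_cpu_to_node_map[cpu] */
        numa_add_cpu(cpu);              /* sets the bit in node_to_cpumask_map[] */

Separately, set_cpu_sibling_map() gains a cpu_llc_id comparison so that TOPOEXT CPUs are only linked as thread siblings when they share a last-level cache, not merely a package and compute unit id.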
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 09df2f9a3d69..3e608edf9958 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o | |||
25 | obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o | 25 | obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o |
26 | obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o | 26 | obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o |
27 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o | 27 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o |
28 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o | ||
28 | 29 | ||
29 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 30 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
30 | 31 | ||
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index f21962c435ed..0919c26820d4 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c | |||
@@ -26,9 +26,7 @@ | |||
26 | #include <asm/apic.h> | 26 | #include <asm/apic.h> |
27 | #include <asm/amd_nb.h> | 27 | #include <asm/amd_nb.h> |
28 | 28 | ||
29 | static struct bootnode __initdata nodes[8]; | ||
30 | static unsigned char __initdata nodeids[8]; | 29 | static unsigned char __initdata nodeids[8]; |
31 | static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; | ||
32 | 30 | ||
33 | static __init int find_northbridge(void) | 31 | static __init int find_northbridge(void) |
34 | { | 32 | { |
@@ -51,7 +49,7 @@ static __init int find_northbridge(void) | |||
51 | return num; | 49 | return num; |
52 | } | 50 | } |
53 | 51 | ||
54 | return -1; | 52 | return -ENOENT; |
55 | } | 53 | } |
56 | 54 | ||
57 | static __init void early_get_boot_cpu_id(void) | 55 | static __init void early_get_boot_cpu_id(void) |
@@ -69,17 +67,18 @@ static __init void early_get_boot_cpu_id(void) | |||
69 | #endif | 67 | #endif |
70 | } | 68 | } |
71 | 69 | ||
72 | int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) | 70 | int __init amd_numa_init(void) |
73 | { | 71 | { |
74 | unsigned long start = PFN_PHYS(start_pfn); | 72 | unsigned long start = PFN_PHYS(0); |
75 | unsigned long end = PFN_PHYS(end_pfn); | 73 | unsigned long end = PFN_PHYS(max_pfn); |
76 | unsigned numnodes; | 74 | unsigned numnodes; |
77 | unsigned long prevbase; | 75 | unsigned long prevbase; |
78 | int i, nb, found = 0; | 76 | int i, j, nb; |
79 | u32 nodeid, reg; | 77 | u32 nodeid, reg; |
78 | unsigned int bits, cores, apicid_base; | ||
80 | 79 | ||
81 | if (!early_pci_allowed()) | 80 | if (!early_pci_allowed()) |
82 | return -1; | 81 | return -EINVAL; |
83 | 82 | ||
84 | nb = find_northbridge(); | 83 | nb = find_northbridge(); |
85 | if (nb < 0) | 84 | if (nb < 0) |
@@ -90,7 +89,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) | |||
90 | reg = read_pci_config(0, nb, 0, 0x60); | 89 | reg = read_pci_config(0, nb, 0, 0x60); |
91 | numnodes = ((reg >> 4) & 0xF) + 1; | 90 | numnodes = ((reg >> 4) & 0xF) + 1; |
92 | if (numnodes <= 1) | 91 | if (numnodes <= 1) |
93 | return -1; | 92 | return -ENOENT; |
94 | 93 | ||
95 | pr_info("Number of physical nodes %d\n", numnodes); | 94 | pr_info("Number of physical nodes %d\n", numnodes); |
96 | 95 | ||
@@ -121,9 +120,9 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) | |||
121 | if ((base >> 8) & 3 || (limit >> 8) & 3) { | 120 | if ((base >> 8) & 3 || (limit >> 8) & 3) { |
122 | pr_err("Node %d using interleaving mode %lx/%lx\n", | 121 | pr_err("Node %d using interleaving mode %lx/%lx\n", |
123 | nodeid, (base >> 8) & 3, (limit >> 8) & 3); | 122 | nodeid, (base >> 8) & 3, (limit >> 8) & 3); |
124 | return -1; | 123 | return -EINVAL; |
125 | } | 124 | } |
126 | if (node_isset(nodeid, nodes_parsed)) { | 125 | if (node_isset(nodeid, numa_nodes_parsed)) { |
127 | pr_info("Node %d already present, skipping\n", | 126 | pr_info("Node %d already present, skipping\n", |
128 | nodeid); | 127 | nodeid); |
129 | continue; | 128 | continue; |
@@ -160,117 +159,28 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) | |||
160 | if (prevbase > base) { | 159 | if (prevbase > base) { |
161 | pr_err("Node map not sorted %lx,%lx\n", | 160 | pr_err("Node map not sorted %lx,%lx\n", |
162 | prevbase, base); | 161 | prevbase, base); |
163 | return -1; | 162 | return -EINVAL; |
164 | } | 163 | } |
165 | 164 | ||
166 | pr_info("Node %d MemBase %016lx Limit %016lx\n", | 165 | pr_info("Node %d MemBase %016lx Limit %016lx\n", |
167 | nodeid, base, limit); | 166 | nodeid, base, limit); |
168 | 167 | ||
169 | found++; | ||
170 | |||
171 | nodes[nodeid].start = base; | ||
172 | nodes[nodeid].end = limit; | ||
173 | |||
174 | prevbase = base; | 168 | prevbase = base; |
175 | 169 | numa_add_memblk(nodeid, base, limit); | |
176 | node_set(nodeid, nodes_parsed); | 170 | node_set(nodeid, numa_nodes_parsed); |
177 | } | ||
178 | |||
179 | if (!found) | ||
180 | return -1; | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | #ifdef CONFIG_NUMA_EMU | ||
185 | static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { | ||
186 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
187 | }; | ||
188 | |||
189 | void __init amd_get_nodes(struct bootnode *physnodes) | ||
190 | { | ||
191 | int i; | ||
192 | |||
193 | for_each_node_mask(i, nodes_parsed) { | ||
194 | physnodes[i].start = nodes[i].start; | ||
195 | physnodes[i].end = nodes[i].end; | ||
196 | } | 171 | } |
197 | } | ||
198 | |||
199 | static int __init find_node_by_addr(unsigned long addr) | ||
200 | { | ||
201 | int ret = NUMA_NO_NODE; | ||
202 | int i; | ||
203 | |||
204 | for (i = 0; i < 8; i++) | ||
205 | if (addr >= nodes[i].start && addr < nodes[i].end) { | ||
206 | ret = i; | ||
207 | break; | ||
208 | } | ||
209 | return ret; | ||
210 | } | ||
211 | 172 | ||
212 | /* | 173 | if (!nodes_weight(numa_nodes_parsed)) |
213 | * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be | 174 | return -ENOENT; |
214 | * set up to represent the physical topology but reflect the emulated | ||
215 | * environment. For each emulated node, the real node which it appears on is | ||
216 | * found and a fake pxm to nid mapping is created which mirrors the actual | ||
217 | * locality. node_distance() then represents the correct distances between | ||
218 | * emulated nodes by using the fake acpi mappings to pxms. | ||
219 | */ | ||
220 | void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) | ||
221 | { | ||
222 | unsigned int bits; | ||
223 | unsigned int cores; | ||
224 | unsigned int apicid_base = 0; | ||
225 | int i; | ||
226 | 175 | ||
176 | /* | ||
177 | * We seem to have valid NUMA configuration. Map apicids to nodes | ||
178 | * using the coreid bits from early_identify_cpu. | ||
179 | */ | ||
227 | bits = boot_cpu_data.x86_coreid_bits; | 180 | bits = boot_cpu_data.x86_coreid_bits; |
228 | cores = 1 << bits; | 181 | cores = 1 << bits; |
229 | early_get_boot_cpu_id(); | ||
230 | if (boot_cpu_physical_apicid > 0) | ||
231 | apicid_base = boot_cpu_physical_apicid; | ||
232 | |||
233 | for (i = 0; i < nr_nodes; i++) { | ||
234 | int index; | ||
235 | int nid; | ||
236 | int j; | ||
237 | |||
238 | nid = find_node_by_addr(nodes[i].start); | ||
239 | if (nid == NUMA_NO_NODE) | ||
240 | continue; | ||
241 | |||
242 | index = nodeids[nid] << bits; | ||
243 | if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE) | ||
244 | for (j = apicid_base; j < cores + apicid_base; j++) | ||
245 | fake_apicid_to_node[index + j] = i; | ||
246 | #ifdef CONFIG_ACPI_NUMA | ||
247 | __acpi_map_pxm_to_node(nid, i); | ||
248 | #endif | ||
249 | } | ||
250 | memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); | ||
251 | } | ||
252 | #endif /* CONFIG_NUMA_EMU */ | ||
253 | |||
254 | int __init amd_scan_nodes(void) | ||
255 | { | ||
256 | unsigned int bits; | ||
257 | unsigned int cores; | ||
258 | unsigned int apicid_base; | ||
259 | int i; | ||
260 | |||
261 | BUG_ON(nodes_empty(nodes_parsed)); | ||
262 | node_possible_map = nodes_parsed; | ||
263 | memnode_shift = compute_hash_shift(nodes, 8, NULL); | ||
264 | if (memnode_shift < 0) { | ||
265 | pr_err("No NUMA node hash function found. Contact maintainer\n"); | ||
266 | return -1; | ||
267 | } | ||
268 | pr_info("Using node hash shift of %d\n", memnode_shift); | ||
269 | |||
270 | /* use the coreid bits from early_identify_cpu */ | ||
271 | bits = boot_cpu_data.x86_coreid_bits; | ||
272 | cores = (1<<bits); | ||
273 | apicid_base = 0; | 182 | apicid_base = 0; |
183 | |||
274 | /* get the APIC ID of the BSP early for systems with apicid lifting */ | 184 | /* get the APIC ID of the BSP early for systems with apicid lifting */ |
275 | early_get_boot_cpu_id(); | 185 | early_get_boot_cpu_id(); |
276 | if (boot_cpu_physical_apicid > 0) { | 186 | if (boot_cpu_physical_apicid > 0) { |
@@ -278,17 +188,9 @@ int __init amd_scan_nodes(void) | |||
278 | apicid_base = boot_cpu_physical_apicid; | 188 | apicid_base = boot_cpu_physical_apicid; |
279 | } | 189 | } |
280 | 190 | ||
281 | for_each_node_mask(i, node_possible_map) { | 191 | for_each_node_mask(i, numa_nodes_parsed) |
282 | int j; | ||
283 | |||
284 | memblock_x86_register_active_regions(i, | ||
285 | nodes[i].start >> PAGE_SHIFT, | ||
286 | nodes[i].end >> PAGE_SHIFT); | ||
287 | for (j = apicid_base; j < cores + apicid_base; j++) | 192 | for (j = apicid_base; j < cores + apicid_base; j++) |
288 | apicid_to_node[(i << bits) + j] = i; | 193 | set_apicid_to_node((i << bits) + j, i); |
289 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
290 | } | ||
291 | 194 | ||
292 | numa_init_array(); | ||
293 | return 0; | 195 | return 0; |
294 | } | 196 | } |
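
The amdtopology_64.c rework above keeps the hardware parsing but hands results to the generic NUMA core through numa_add_memblk() and numa_nodes_parsed instead of a private bootnode[] array, and it now returns -errno values. As a standalone illustration (not kernel code) of the node-count decoding it retains, with a hypothetical raw value standing in for read_pci_config(0, nb, 0, 0x60):

#include <stdio.h>

int main(void)
{
        /* hypothetical value of AMD northbridge config register 0x60 */
        unsigned int reg = 0x00000070;
        /* per the code above, the 4-bit field at bits 7:4 is node count - 1 */
        unsigned int numnodes = ((reg >> 4) & 0xF) + 1;

        printf("Number of physical nodes %u\n", numnodes);     /* prints 8 */
        return 0;
}
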
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 947f42abe820..286d289b039b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -18,9 +18,9 @@ | |||
18 | 18 | ||
19 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | 19 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); |
20 | 20 | ||
21 | unsigned long __initdata e820_table_start; | 21 | unsigned long __initdata pgt_buf_start; |
22 | unsigned long __meminitdata e820_table_end; | 22 | unsigned long __meminitdata pgt_buf_end; |
23 | unsigned long __meminitdata e820_table_top; | 23 | unsigned long __meminitdata pgt_buf_top; |
24 | 24 | ||
25 | int after_bootmem; | 25 | int after_bootmem; |
26 | 26 | ||
@@ -33,7 +33,7 @@ int direct_gbpages | |||
33 | static void __init find_early_table_space(unsigned long end, int use_pse, | 33 | static void __init find_early_table_space(unsigned long end, int use_pse, |
34 | int use_gbpages) | 34 | int use_gbpages) |
35 | { | 35 | { |
36 | unsigned long puds, pmds, ptes, tables, start; | 36 | unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; |
37 | phys_addr_t base; | 37 | phys_addr_t base; |
38 | 38 | ||
39 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | 39 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; |
@@ -65,29 +65,20 @@ static void __init find_early_table_space(unsigned long end, int use_pse, | |||
65 | #ifdef CONFIG_X86_32 | 65 | #ifdef CONFIG_X86_32 |
66 | /* for fixmap */ | 66 | /* for fixmap */ |
67 | tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); | 67 | tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); |
68 | #endif | ||
69 | 68 | ||
70 | /* | 69 | good_end = max_pfn_mapped << PAGE_SHIFT; |
71 | * RED-PEN putting page tables only on node 0 could | ||
72 | * cause a hotspot and fill up ZONE_DMA. The page tables | ||
73 | * need roughly 0.5KB per GB. | ||
74 | */ | ||
75 | #ifdef CONFIG_X86_32 | ||
76 | start = 0x7000; | ||
77 | #else | ||
78 | start = 0x8000; | ||
79 | #endif | 70 | #endif |
80 | base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, | 71 | |
81 | tables, PAGE_SIZE); | 72 | base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); |
82 | if (base == MEMBLOCK_ERROR) | 73 | if (base == MEMBLOCK_ERROR) |
83 | panic("Cannot find space for the kernel page tables"); | 74 | panic("Cannot find space for the kernel page tables"); |
84 | 75 | ||
85 | e820_table_start = base >> PAGE_SHIFT; | 76 | pgt_buf_start = base >> PAGE_SHIFT; |
86 | e820_table_end = e820_table_start; | 77 | pgt_buf_end = pgt_buf_start; |
87 | e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); | 78 | pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); |
88 | 79 | ||
89 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", | 80 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", |
90 | end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); | 81 | end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); |
91 | } | 82 | } |
92 | 83 | ||
93 | struct map_range { | 84 | struct map_range { |
@@ -279,30 +270,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
279 | load_cr3(swapper_pg_dir); | 270 | load_cr3(swapper_pg_dir); |
280 | #endif | 271 | #endif |
281 | 272 | ||
282 | #ifdef CONFIG_X86_64 | ||
283 | if (!after_bootmem && !start) { | ||
284 | pud_t *pud; | ||
285 | pmd_t *pmd; | ||
286 | |||
287 | mmu_cr4_features = read_cr4(); | ||
288 | |||
289 | /* | ||
290 | * _brk_end cannot change anymore, but it and _end may be | ||
291 | * located on different 2M pages. cleanup_highmap(), however, | ||
292 | * can only consider _end when it runs, so destroy any | ||
293 | * mappings beyond _brk_end here. | ||
294 | */ | ||
295 | pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); | ||
296 | pmd = pmd_offset(pud, _brk_end - 1); | ||
297 | while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) | ||
298 | pmd_clear(pmd); | ||
299 | } | ||
300 | #endif | ||
301 | __flush_tlb_all(); | 273 | __flush_tlb_all(); |
302 | 274 | ||
303 | if (!after_bootmem && e820_table_end > e820_table_start) | 275 | if (!after_bootmem && pgt_buf_end > pgt_buf_start) |
304 | memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, | 276 | memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, |
305 | e820_table_end << PAGE_SHIFT, "PGTABLE"); | 277 | pgt_buf_end << PAGE_SHIFT, "PGTABLE"); |
306 | 278 | ||
307 | if (!after_bootmem) | 279 | if (!after_bootmem) |
308 | early_memtest(start, end); | 280 | early_memtest(start, end); |
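
The sizing arithmetic in find_early_table_space() above is easy to check by hand. A standalone sketch with x86-64 constants assumed (a PUD entry covers 1 GiB, a PMD entry 2 MiB) and the PSE/gbpages branches ignored:

#include <stdio.h>

#define PUD_SHIFT 30
#define PMD_SHIFT 21
#define PUD_SIZE (1ULL << PUD_SHIFT)
#define PMD_SIZE (1ULL << PMD_SHIFT)

int main(void)
{
        unsigned long long end = 4ULL << 30;    /* direct-map the first 4 GiB */
        unsigned long long puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        unsigned long long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;

        printf("puds=%llu pmds=%llu\n", puds, pmds);    /* puds=4 pmds=2048 */
        return 0;
}

With 8-byte entries and 512 entries per page, those counts round up to one page of PUD entries and four pages of PMD entries, which is what the tables accumulator ends up reserving.
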
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c821074b7f0b..73ad7ebd6e9c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false; | |||
62 | 62 | ||
63 | static __init void *alloc_low_page(void) | 63 | static __init void *alloc_low_page(void) |
64 | { | 64 | { |
65 | unsigned long pfn = e820_table_end++; | 65 | unsigned long pfn = pgt_buf_end++; |
66 | void *adr; | 66 | void *adr; |
67 | 67 | ||
68 | if (pfn >= e820_table_top) | 68 | if (pfn >= pgt_buf_top) |
69 | panic("alloc_low_page: ran out of memory"); | 69 | panic("alloc_low_page: ran out of memory"); |
70 | 70 | ||
71 | adr = __va(pfn * PAGE_SIZE); | 71 | adr = __va(pfn * PAGE_SIZE); |
@@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, | |||
163 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end | 163 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end |
164 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin | 164 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin |
165 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end | 165 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end |
166 | && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start | 166 | && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start |
167 | || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { | 167 | || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { |
168 | pte_t *newpte; | 168 | pte_t *newpte; |
169 | int i; | 169 | int i; |
170 | 170 | ||
@@ -644,8 +644,7 @@ void __init find_low_pfn_range(void) | |||
644 | } | 644 | } |
645 | 645 | ||
646 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 646 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
647 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | 647 | void __init initmem_init(void) |
648 | int acpi, int k8) | ||
649 | { | 648 | { |
650 | #ifdef CONFIG_HIGHMEM | 649 | #ifdef CONFIG_HIGHMEM |
651 | highstart_pfn = highend_pfn = max_pfn; | 650 | highstart_pfn = highend_pfn = max_pfn; |
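
The pgt_buf_{start,end,top} trio renamed in the previous file behaves as a bump allocator, and alloc_low_page() above is its fast path: pgt_buf_end is the cursor, pgt_buf_top the hard limit. A standalone sketch of that discipline, with heap memory standing in for the reserved memblock window:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

static unsigned long pgt_buf_start, pgt_buf_end, pgt_buf_top;
static unsigned char *window;   /* stand-in for the reserved physical range */

static void *alloc_low_page(void)
{
        unsigned long pfn = pgt_buf_end++;      /* bump the cursor */

        if (pfn >= pgt_buf_top) {
                fprintf(stderr, "alloc_low_page: ran out of memory\n");
                exit(1);
        }
        return window + (pfn - pgt_buf_start) * PAGE_SIZE;
}

int main(void)
{
        pgt_buf_start = 0x100;                  /* hypothetical first pfn */
        pgt_buf_end = pgt_buf_start;
        pgt_buf_top = pgt_buf_start + 4;        /* room for four page tables */
        window = calloc(4, PAGE_SIZE);

        printf("page at %p, pages used: %lu\n", alloc_low_page(),
               pgt_buf_end - pgt_buf_start);
        free(window);
        return 0;
}
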
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c14a5422e152..a08a62cb136e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -314,7 +314,7 @@ void __init cleanup_highmap(void) | |||
314 | 314 | ||
315 | static __ref void *alloc_low_page(unsigned long *phys) | 315 | static __ref void *alloc_low_page(unsigned long *phys) |
316 | { | 316 | { |
317 | unsigned long pfn = e820_table_end++; | 317 | unsigned long pfn = pgt_buf_end++; |
318 | void *adr; | 318 | void *adr; |
319 | 319 | ||
320 | if (after_bootmem) { | 320 | if (after_bootmem) { |
@@ -324,7 +324,7 @@ static __ref void *alloc_low_page(unsigned long *phys) | |||
324 | return adr; | 324 | return adr; |
325 | } | 325 | } |
326 | 326 | ||
327 | if (pfn >= e820_table_top) | 327 | if (pfn >= pgt_buf_top) |
328 | panic("alloc_low_page: ran out of memory"); | 328 | panic("alloc_low_page: ran out of memory"); |
329 | 329 | ||
330 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); | 330 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); |
@@ -333,12 +333,28 @@ static __ref void *alloc_low_page(unsigned long *phys) | |||
333 | return adr; | 333 | return adr; |
334 | } | 334 | } |
335 | 335 | ||
336 | static __ref void *map_low_page(void *virt) | ||
337 | { | ||
338 | void *adr; | ||
339 | unsigned long phys, left; | ||
340 | |||
341 | if (after_bootmem) | ||
342 | return virt; | ||
343 | |||
344 | phys = __pa(virt); | ||
345 | left = phys & (PAGE_SIZE - 1); | ||
346 | adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); | ||
347 | adr = (void *)(((unsigned long)adr) | left); | ||
348 | |||
349 | return adr; | ||
350 | } | ||
351 | |||
336 | static __ref void unmap_low_page(void *adr) | 352 | static __ref void unmap_low_page(void *adr) |
337 | { | 353 | { |
338 | if (after_bootmem) | 354 | if (after_bootmem) |
339 | return; | 355 | return; |
340 | 356 | ||
341 | early_iounmap(adr, PAGE_SIZE); | 357 | early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); |
342 | } | 358 | } |
343 | 359 | ||
344 | static unsigned long __meminit | 360 | static unsigned long __meminit |
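
The new map_low_page()/unmap_low_page() pair above lets callers pass an unaligned table address through an early mapping: the containing page is mapped, the sub-page offset is OR-ed back into the returned pointer, and unmap masks it off again before tearing the mapping down. The pointer arithmetic in isolation (standalone sketch; the mapping address is made up, standing in for an early_memremap() result):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        uintptr_t phys = 0x12345678;                    /* hypothetical table address */
        uintptr_t left = phys & (PAGE_SIZE - 1);        /* sub-page offset 0x678 */
        uintptr_t mapped = 0xdead0000;                  /* pretend early_memremap() */
        uintptr_t adr = mapped | left;                  /* what map_low_page() returns */

        /* unmap_low_page() recovers the page-aligned mapping to release it */
        printf("adr=%#lx, unmap %#lx\n", (unsigned long)adr,
               (unsigned long)(adr & PAGE_MASK));
        return 0;
}
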
@@ -386,15 +402,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, | |||
386 | } | 402 | } |
387 | 403 | ||
388 | static unsigned long __meminit | 404 | static unsigned long __meminit |
389 | phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, | ||
390 | pgprot_t prot) | ||
391 | { | ||
392 | pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); | ||
393 | |||
394 | return phys_pte_init(pte, address, end, prot); | ||
395 | } | ||
396 | |||
397 | static unsigned long __meminit | ||
398 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | 405 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, |
399 | unsigned long page_size_mask, pgprot_t prot) | 406 | unsigned long page_size_mask, pgprot_t prot) |
400 | { | 407 | { |
@@ -420,8 +427,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
420 | if (pmd_val(*pmd)) { | 427 | if (pmd_val(*pmd)) { |
421 | if (!pmd_large(*pmd)) { | 428 | if (!pmd_large(*pmd)) { |
422 | spin_lock(&init_mm.page_table_lock); | 429 | spin_lock(&init_mm.page_table_lock); |
423 | last_map_addr = phys_pte_update(pmd, address, | 430 | pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); |
431 | last_map_addr = phys_pte_init(pte, address, | ||
424 | end, prot); | 432 | end, prot); |
433 | unmap_low_page(pte); | ||
425 | spin_unlock(&init_mm.page_table_lock); | 434 | spin_unlock(&init_mm.page_table_lock); |
426 | continue; | 435 | continue; |
427 | } | 436 | } |
@@ -468,18 +477,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
468 | } | 477 | } |
469 | 478 | ||
470 | static unsigned long __meminit | 479 | static unsigned long __meminit |
471 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, | ||
472 | unsigned long page_size_mask, pgprot_t prot) | ||
473 | { | ||
474 | pmd_t *pmd = pmd_offset(pud, 0); | ||
475 | unsigned long last_map_addr; | ||
476 | |||
477 | last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); | ||
478 | __flush_tlb_all(); | ||
479 | return last_map_addr; | ||
480 | } | ||
481 | |||
482 | static unsigned long __meminit | ||
483 | phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | 480 | phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, |
484 | unsigned long page_size_mask) | 481 | unsigned long page_size_mask) |
485 | { | 482 | { |
@@ -504,8 +501,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
504 | 501 | ||
505 | if (pud_val(*pud)) { | 502 | if (pud_val(*pud)) { |
506 | if (!pud_large(*pud)) { | 503 | if (!pud_large(*pud)) { |
507 | last_map_addr = phys_pmd_update(pud, addr, end, | 504 | pmd = map_low_page(pmd_offset(pud, 0)); |
505 | last_map_addr = phys_pmd_init(pmd, addr, end, | ||
508 | page_size_mask, prot); | 506 | page_size_mask, prot); |
507 | unmap_low_page(pmd); | ||
508 | __flush_tlb_all(); | ||
509 | continue; | 509 | continue; |
510 | } | 510 | } |
511 | /* | 511 | /* |
@@ -553,17 +553,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
553 | return last_map_addr; | 553 | return last_map_addr; |
554 | } | 554 | } |
555 | 555 | ||
556 | static unsigned long __meminit | ||
557 | phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, | ||
558 | unsigned long page_size_mask) | ||
559 | { | ||
560 | pud_t *pud; | ||
561 | |||
562 | pud = (pud_t *)pgd_page_vaddr(*pgd); | ||
563 | |||
564 | return phys_pud_init(pud, addr, end, page_size_mask); | ||
565 | } | ||
566 | |||
567 | unsigned long __meminit | 556 | unsigned long __meminit |
568 | kernel_physical_mapping_init(unsigned long start, | 557 | kernel_physical_mapping_init(unsigned long start, |
569 | unsigned long end, | 558 | unsigned long end, |
@@ -587,8 +576,10 @@ kernel_physical_mapping_init(unsigned long start, | |||
587 | next = end; | 576 | next = end; |
588 | 577 | ||
589 | if (pgd_val(*pgd)) { | 578 | if (pgd_val(*pgd)) { |
590 | last_map_addr = phys_pud_update(pgd, __pa(start), | 579 | pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); |
580 | last_map_addr = phys_pud_init(pud, __pa(start), | ||
591 | __pa(end), page_size_mask); | 581 | __pa(end), page_size_mask); |
582 | unmap_low_page(pud); | ||
592 | continue; | 583 | continue; |
593 | } | 584 | } |
594 | 585 | ||
@@ -612,10 +603,9 @@ kernel_physical_mapping_init(unsigned long start, | |||
612 | } | 603 | } |
613 | 604 | ||
614 | #ifndef CONFIG_NUMA | 605 | #ifndef CONFIG_NUMA |
615 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | 606 | void __init initmem_init(void) |
616 | int acpi, int k8) | ||
617 | { | 607 | { |
618 | memblock_x86_register_active_regions(0, start_pfn, end_pfn); | 608 | memblock_x86_register_active_regions(0, 0, max_pfn); |
619 | } | 609 | } |
620 | #endif | 610 | #endif |
621 | 611 | ||
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ebf6d7887a38..9559d360fde7 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c | |||
@@ -26,12 +26,50 @@ static __init int numa_setup(char *opt) | |||
26 | early_param("numa", numa_setup); | 26 | early_param("numa", numa_setup); |
27 | 27 | ||
28 | /* | 28 | /* |
29 | * Which logical CPUs are on which nodes | 29 | * apicid, cpu, node mappings |
30 | */ | 30 | */ |
31 | s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | ||
32 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
33 | }; | ||
34 | |||
31 | cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; | 35 | cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; |
32 | EXPORT_SYMBOL(node_to_cpumask_map); | 36 | EXPORT_SYMBOL(node_to_cpumask_map); |
33 | 37 | ||
34 | /* | 38 | /* |
39 | * Map cpu index to node index | ||
40 | */ | ||
41 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); | ||
42 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
43 | |||
44 | void __cpuinit numa_set_node(int cpu, int node) | ||
45 | { | ||
46 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); | ||
47 | |||
48 | /* early setting, no percpu area yet */ | ||
49 | if (cpu_to_node_map) { | ||
50 | cpu_to_node_map[cpu] = node; | ||
51 | return; | ||
52 | } | ||
53 | |||
54 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | ||
55 | if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { | ||
56 | printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); | ||
57 | dump_stack(); | ||
58 | return; | ||
59 | } | ||
60 | #endif | ||
61 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
62 | |||
63 | if (node != NUMA_NO_NODE) | ||
64 | set_cpu_numa_node(cpu, node); | ||
65 | } | ||
66 | |||
67 | void __cpuinit numa_clear_node(int cpu) | ||
68 | { | ||
69 | numa_set_node(cpu, NUMA_NO_NODE); | ||
70 | } | ||
71 | |||
72 | /* | ||
35 | * Allocate node_to_cpumask_map based on number of available nodes | 73 | * Allocate node_to_cpumask_map based on number of available nodes |
36 | * Requires node_possible_map to be valid. | 74 | * Requires node_possible_map to be valid. |
37 | * | 75 | * |
@@ -57,7 +95,174 @@ void __init setup_node_to_cpumask_map(void) | |||
57 | pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); | 95 | pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); |
58 | } | 96 | } |
59 | 97 | ||
60 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | 98 | /* |
99 | * There are unfortunately some poorly designed mainboards around that | ||
100 | * only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
101 | * mapping. To avoid this fill in the mapping for all possible CPUs, | ||
102 | * as the number of CPUs is not known yet. We round robin the existing | ||
103 | * nodes. | ||
104 | */ | ||
105 | void __init numa_init_array(void) | ||
106 | { | ||
107 | int rr, i; | ||
108 | |||
109 | rr = first_node(node_online_map); | ||
110 | for (i = 0; i < nr_cpu_ids; i++) { | ||
111 | if (early_cpu_to_node(i) != NUMA_NO_NODE) | ||
112 | continue; | ||
113 | numa_set_node(i, rr); | ||
114 | rr = next_node(rr, node_online_map); | ||
115 | if (rr == MAX_NUMNODES) | ||
116 | rr = first_node(node_online_map); | ||
117 | } | ||
118 | } | ||
119 | |||
120 | static __init int find_near_online_node(int node) | ||
121 | { | ||
122 | int n, val; | ||
123 | int min_val = INT_MAX; | ||
124 | int best_node = -1; | ||
125 | |||
126 | for_each_online_node(n) { | ||
127 | val = node_distance(node, n); | ||
128 | |||
129 | if (val < min_val) { | ||
130 | min_val = val; | ||
131 | best_node = n; | ||
132 | } | ||
133 | } | ||
134 | |||
135 | return best_node; | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * Setup early cpu_to_node. | ||
140 | * | ||
141 | * Populate cpu_to_node[] only if x86_cpu_to_apicid[], | ||
142 | * and apicid_to_node[] tables have valid entries for a CPU. | ||
143 | * This means we skip cpu_to_node[] initialisation for NUMA | ||
144 | * emulation and faking node case (when running a kernel compiled | ||
145 | * for NUMA on a non NUMA box), which is OK as cpu_to_node[] | ||
146 | * is already initialized in a round robin manner at numa_init_array, | ||
147 | * prior to this call, and this initialization is good enough | ||
148 | * for the fake NUMA cases. | ||
149 | * | ||
150 | * Called before the per_cpu areas are setup. | ||
151 | */ | ||
152 | void __init init_cpu_to_node(void) | ||
153 | { | ||
154 | int cpu; | ||
155 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); | ||
156 | |||
157 | BUG_ON(cpu_to_apicid == NULL); | ||
158 | |||
159 | for_each_possible_cpu(cpu) { | ||
160 | int node = numa_cpu_node(cpu); | ||
161 | |||
162 | if (node == NUMA_NO_NODE) | ||
163 | continue; | ||
164 | if (!node_online(node)) | ||
165 | node = find_near_online_node(node); | ||
166 | numa_set_node(cpu, node); | ||
167 | } | ||
168 | } | ||
169 | |||
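
When firmware maps a CPU to a node that never came online, init_cpu_to_node() above substitutes the nearest online node by scanning for the smallest node_distance(). The selection loop in isolation (standalone sketch; the 2x2 distance table and node states are hypothetical):

#include <stdio.h>
#include <limits.h>

static int dist[2][2] = { { 10, 20 }, { 20, 10 } };     /* node_distance() stand-in */

int main(void)
{
        int target = 1;                 /* offline node needing a substitute */
        int online[] = { 0 };           /* only node 0 is online */
        int min_val = INT_MAX, best_node = -1, i;

        for (i = 0; i < 1; i++) {
                int val = dist[target][online[i]];
                if (val < min_val) {
                        min_val = val;
                        best_node = online[i];
                }
        }
        printf("nearest online node to %d: %d\n", target, best_node);
        return 0;
}
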
170 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | ||
171 | |||
172 | # ifndef CONFIG_NUMA_EMU | ||
173 | void __cpuinit numa_add_cpu(int cpu) | ||
174 | { | ||
175 | cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
176 | } | ||
177 | |||
178 | void __cpuinit numa_remove_cpu(int cpu) | ||
179 | { | ||
180 | cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
181 | } | ||
182 | # endif /* !CONFIG_NUMA_EMU */ | ||
183 | |||
184 | #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
185 | |||
186 | int __cpu_to_node(int cpu) | ||
187 | { | ||
188 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) { | ||
189 | printk(KERN_WARNING | ||
190 | "cpu_to_node(%d): usage too early!\n", cpu); | ||
191 | dump_stack(); | ||
192 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
193 | } | ||
194 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
195 | } | ||
196 | EXPORT_SYMBOL(__cpu_to_node); | ||
197 | |||
198 | /* | ||
199 | * Same function as cpu_to_node() but used if called before the | ||
200 | * per_cpu areas are setup. | ||
201 | */ | ||
202 | int early_cpu_to_node(int cpu) | ||
203 | { | ||
204 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) | ||
205 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
206 | |||
207 | if (!cpu_possible(cpu)) { | ||
208 | printk(KERN_WARNING | ||
209 | "early_cpu_to_node(%d): no per_cpu area!\n", cpu); | ||
210 | dump_stack(); | ||
211 | return NUMA_NO_NODE; | ||
212 | } | ||
213 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
214 | } | ||
215 | |||
216 | struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) | ||
217 | { | ||
218 | int node = early_cpu_to_node(cpu); | ||
219 | struct cpumask *mask; | ||
220 | char buf[64]; | ||
221 | |||
222 | if (node == NUMA_NO_NODE) { | ||
223 | /* early_cpu_to_node() already emits a warning and trace */ | ||
224 | return NULL; | ||
225 | } | ||
226 | mask = node_to_cpumask_map[node]; | ||
227 | if (!mask) { | ||
228 | pr_err("node_to_cpumask_map[%i] NULL\n", node); | ||
229 | dump_stack(); | ||
230 | return NULL; | ||
231 | } | ||
232 | |||
233 | cpulist_scnprintf(buf, sizeof(buf), mask); | ||
234 | printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", | ||
235 | enable ? "numa_add_cpu" : "numa_remove_cpu", | ||
236 | cpu, node, buf); | ||
237 | return mask; | ||
238 | } | ||
239 | |||
240 | # ifndef CONFIG_NUMA_EMU | ||
241 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
242 | { | ||
243 | struct cpumask *mask; | ||
244 | |||
245 | mask = debug_cpumask_set_cpu(cpu, enable); | ||
246 | if (!mask) | ||
247 | return; | ||
248 | |||
249 | if (enable) | ||
250 | cpumask_set_cpu(cpu, mask); | ||
251 | else | ||
252 | cpumask_clear_cpu(cpu, mask); | ||
253 | } | ||
254 | |||
255 | void __cpuinit numa_add_cpu(int cpu) | ||
256 | { | ||
257 | numa_set_cpumask(cpu, 1); | ||
258 | } | ||
259 | |||
260 | void __cpuinit numa_remove_cpu(int cpu) | ||
261 | { | ||
262 | numa_set_cpumask(cpu, 0); | ||
263 | } | ||
264 | # endif /* !CONFIG_NUMA_EMU */ | ||
265 | |||
61 | /* | 266 | /* |
62 | * Returns a pointer to the bitmask of CPUs on Node 'node'. | 267 | * Returns a pointer to the bitmask of CPUs on Node 'node'. |
63 | */ | 268 | */ |
@@ -80,4 +285,5 @@ const struct cpumask *cpumask_of_node(int node) | |||
80 | return node_to_cpumask_map[node]; | 285 | return node_to_cpumask_map[node]; |
81 | } | 286 | } |
82 | EXPORT_SYMBOL(cpumask_of_node); | 287 | EXPORT_SYMBOL(cpumask_of_node); |
83 | #endif | 288 | |
289 | #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
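
numa_init_array(), now hosted in the common numa.c above, covers boards that leave some CPUs without a parsed node by spreading them round-robin over the online nodes. The loop in miniature (standalone sketch; node and CPU counts are invented, and an index into online[] replaces the kernel's next_node() walk):

#include <stdio.h>

int main(void)
{
        int online[] = { 0, 2 };        /* hypothetical online nodes */
        int nr_online = 2, nr_cpu_ids = 6;
        int cpu_to_node[6] = { 0, -1, -1, 2, -1, -1 };  /* -1 == NUMA_NO_NODE */
        int rr = 0, i;

        for (i = 0; i < nr_cpu_ids; i++) {
                if (cpu_to_node[i] != -1)
                        continue;       /* already mapped by firmware tables */
                cpu_to_node[i] = online[rr];
                rr = (rr + 1) % nr_online;
        }
        for (i = 0; i < nr_cpu_ids; i++)
                printf("cpu%d -> node %d\n", i, cpu_to_node[i]);
        return 0;
}
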
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 84a3e4c9f277..bde3906420df 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c | |||
@@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); | |||
110 | 110 | ||
111 | static unsigned long kva_start_pfn; | 111 | static unsigned long kva_start_pfn; |
112 | static unsigned long kva_pages; | 112 | static unsigned long kva_pages; |
113 | |||
114 | int __cpuinit numa_cpu_node(int cpu) | ||
115 | { | ||
116 | return apic->x86_32_numa_cpu_node(cpu); | ||
117 | } | ||
118 | |||
113 | /* | 119 | /* |
114 | * FLAT - support for basic PC memory model with discontig enabled, essentially | 120 | * FLAT - support for basic PC memory model with discontig enabled, essentially |
115 | * a single node with all available processors in it with a flat | 121 | * a single node with all available processors in it with a flat |
@@ -346,8 +352,7 @@ static void init_remap_allocator(int nid) | |||
346 | (ulong) node_remap_end_vaddr[nid]); | 352 | (ulong) node_remap_end_vaddr[nid]); |
347 | } | 353 | } |
348 | 354 | ||
349 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | 355 | void __init initmem_init(void) |
350 | int acpi, int k8) | ||
351 | { | 356 | { |
352 | int nid; | 357 | int nid; |
353 | long kva_target_pfn; | 358 | long kva_target_pfn; |
@@ -361,6 +366,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | |||
361 | */ | 366 | */ |
362 | 367 | ||
363 | get_memcfg_numa(); | 368 | get_memcfg_numa(); |
369 | numa_init_array(); | ||
364 | 370 | ||
365 | kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); | 371 | kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); |
366 | 372 | ||
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 1337c51b07d7..9ec0f209a6a4 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -13,31 +13,30 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/nodemask.h> | 14 | #include <linux/nodemask.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/acpi.h> | ||
16 | 17 | ||
17 | #include <asm/e820.h> | 18 | #include <asm/e820.h> |
18 | #include <asm/proto.h> | 19 | #include <asm/proto.h> |
19 | #include <asm/dma.h> | 20 | #include <asm/dma.h> |
20 | #include <asm/numa.h> | ||
21 | #include <asm/acpi.h> | 21 | #include <asm/acpi.h> |
22 | #include <asm/amd_nb.h> | 22 | #include <asm/amd_nb.h> |
23 | 23 | ||
24 | #include "numa_internal.h" | ||
25 | |||
24 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | 26 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; |
25 | EXPORT_SYMBOL(node_data); | 27 | EXPORT_SYMBOL(node_data); |
26 | 28 | ||
27 | struct memnode memnode; | 29 | nodemask_t numa_nodes_parsed __initdata; |
28 | 30 | ||
29 | s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | 31 | struct memnode memnode; |
30 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
31 | }; | ||
32 | 32 | ||
33 | static unsigned long __initdata nodemap_addr; | 33 | static unsigned long __initdata nodemap_addr; |
34 | static unsigned long __initdata nodemap_size; | 34 | static unsigned long __initdata nodemap_size; |
35 | 35 | ||
36 | /* | 36 | static struct numa_meminfo numa_meminfo __initdata; |
37 | * Map cpu index to node index | 37 | |
38 | */ | 38 | static int numa_distance_cnt; |
39 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); | 39 | static u8 *numa_distance; |
40 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
41 | 40 | ||
42 | /* | 41 | /* |
43 | * Given a shift value, try to populate memnodemap[] | 42 | * Given a shift value, try to populate memnodemap[] |
@@ -46,16 +45,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | |||
46 | * 0 if memnodemap[] too small (or shift too small) | 45 |
47 | * -1 if node overlap or lost ram (shift too big) | 46 | * -1 if node overlap or lost ram (shift too big) |
48 | */ | 47 | */ |
49 | static int __init populate_memnodemap(const struct bootnode *nodes, | 48 | static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift) |
50 | int numnodes, int shift, int *nodeids) | ||
51 | { | 49 | { |
52 | unsigned long addr, end; | 50 | unsigned long addr, end; |
53 | int i, res = -1; | 51 | int i, res = -1; |
54 | 52 | ||
55 | memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); | 53 | memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); |
56 | for (i = 0; i < numnodes; i++) { | 54 | for (i = 0; i < mi->nr_blks; i++) { |
57 | addr = nodes[i].start; | 55 | addr = mi->blk[i].start; |
58 | end = nodes[i].end; | 56 | end = mi->blk[i].end; |
59 | if (addr >= end) | 57 | if (addr >= end) |
60 | continue; | 58 | continue; |
61 | if ((end >> shift) >= memnodemapsize) | 59 | if ((end >> shift) >= memnodemapsize) |
@@ -63,12 +61,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes, | |||
63 | do { | 61 | do { |
64 | if (memnodemap[addr >> shift] != NUMA_NO_NODE) | 62 | if (memnodemap[addr >> shift] != NUMA_NO_NODE) |
65 | return -1; | 63 | return -1; |
66 | 64 | memnodemap[addr >> shift] = mi->blk[i].nid; | |
67 | if (!nodeids) | ||
68 | memnodemap[addr >> shift] = i; | ||
69 | else | ||
70 | memnodemap[addr >> shift] = nodeids[i]; | ||
71 | |||
72 | addr += (1UL << shift); | 65 | addr += (1UL << shift); |
73 | } while (addr < end); | 66 | } while (addr < end); |
74 | res = 1; | 67 | res = 1; |
@@ -86,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
86 | 79 | ||
87 | addr = 0x8000; | 80 | addr = 0x8000; |
88 | nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); | 81 | nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); |
89 | nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, | 82 | nodemap_addr = memblock_find_in_range(addr, get_max_mapped(), |
90 | nodemap_size, L1_CACHE_BYTES); | 83 | nodemap_size, L1_CACHE_BYTES); |
91 | if (nodemap_addr == MEMBLOCK_ERROR) { | 84 | if (nodemap_addr == MEMBLOCK_ERROR) { |
92 | printk(KERN_ERR | 85 | printk(KERN_ERR |
@@ -106,16 +99,15 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
106 | * The LSB of all start and end addresses in the node map is the value of the | 99 | * The LSB of all start and end addresses in the node map is the value of the |
107 | * maximum possible shift. | 100 | * maximum possible shift. |
108 | */ | 101 | */ |
109 | static int __init extract_lsb_from_nodes(const struct bootnode *nodes, | 102 | static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi) |
110 | int numnodes) | ||
111 | { | 103 | { |
112 | int i, nodes_used = 0; | 104 | int i, nodes_used = 0; |
113 | unsigned long start, end; | 105 | unsigned long start, end; |
114 | unsigned long bitfield = 0, memtop = 0; | 106 | unsigned long bitfield = 0, memtop = 0; |
115 | 107 | ||
116 | for (i = 0; i < numnodes; i++) { | 108 | for (i = 0; i < mi->nr_blks; i++) { |
117 | start = nodes[i].start; | 109 | start = mi->blk[i].start; |
118 | end = nodes[i].end; | 110 | end = mi->blk[i].end; |
119 | if (start >= end) | 111 | if (start >= end) |
120 | continue; | 112 | continue; |
121 | bitfield |= start; | 113 | bitfield |= start; |
@@ -131,18 +123,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes, | |||
131 | return i; | 123 | return i; |
132 | } | 124 | } |
133 | 125 | ||
134 | int __init compute_hash_shift(struct bootnode *nodes, int numnodes, | 126 | static int __init compute_hash_shift(const struct numa_meminfo *mi) |
135 | int *nodeids) | ||
136 | { | 127 | { |
137 | int shift; | 128 | int shift; |
138 | 129 | ||
139 | shift = extract_lsb_from_nodes(nodes, numnodes); | 130 | shift = extract_lsb_from_nodes(mi); |
140 | if (allocate_cachealigned_memnodemap()) | 131 | if (allocate_cachealigned_memnodemap()) |
141 | return -1; | 132 | return -1; |
142 | printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", | 133 | printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", |
143 | shift); | 134 | shift); |
144 | 135 | ||
145 | if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { | 136 | if (populate_memnodemap(mi, shift) != 1) { |
146 | printk(KERN_INFO "Your memory is not aligned you need to " | 137 | printk(KERN_INFO "Your memory is not aligned you need to " |
147 | "rebuild your kernel with a bigger NODEMAPSIZE " | 138 | "rebuild your kernel with a bigger NODEMAPSIZE " |
148 | "shift=%d\n", shift); | 139 | "shift=%d\n", shift); |
@@ -188,6 +179,63 @@ static void * __init early_node_mem(int nodeid, unsigned long start, | |||
188 | return NULL; | 179 | return NULL; |
189 | } | 180 | } |
190 | 181 | ||
182 | static int __init numa_add_memblk_to(int nid, u64 start, u64 end, | ||
183 | struct numa_meminfo *mi) | ||
184 | { | ||
185 | /* ignore zero length blks */ | ||
186 | if (start == end) | ||
187 | return 0; | ||
188 | |||
189 | /* whine about and ignore invalid blks */ | ||
190 | if (start > end || nid < 0 || nid >= MAX_NUMNODES) { | ||
191 | pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", | ||
192 | nid, start, end); | ||
193 | return 0; | ||
194 | } | ||
195 | |||
196 | if (mi->nr_blks >= NR_NODE_MEMBLKS) { | ||
197 | pr_err("NUMA: too many memblk ranges\n"); | ||
198 | return -EINVAL; | ||
199 | } | ||
200 | |||
201 | mi->blk[mi->nr_blks].start = start; | ||
202 | mi->blk[mi->nr_blks].end = end; | ||
203 | mi->blk[mi->nr_blks].nid = nid; | ||
204 | mi->nr_blks++; | ||
205 | return 0; | ||
206 | } | ||
207 | |||
208 | /** | ||
209 | * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo | ||
210 | * @idx: Index of memblk to remove | ||
211 | * @mi: numa_meminfo to remove memblk from | ||
212 | * | ||
213 | * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and | ||
214 | * decrementing @mi->nr_blks. | ||
215 | */ | ||
216 | void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) | ||
217 | { | ||
218 | mi->nr_blks--; | ||
219 | memmove(&mi->blk[idx], &mi->blk[idx + 1], | ||
220 | (mi->nr_blks - idx) * sizeof(mi->blk[0])); | ||
221 | } | ||
222 | |||
223 | /** | ||
224 | * numa_add_memblk - Add one numa_memblk to numa_meminfo | ||
225 | * @nid: NUMA node ID of the new memblk | ||
226 | * @start: Start address of the new memblk | ||
227 | * @end: End address of the new memblk | ||
228 | * | ||
229 | * Add a new memblk to the default numa_meminfo. | ||
230 | * | ||
231 | * RETURNS: | ||
232 | * 0 on success, -errno on failure. | ||
233 | */ | ||
234 | int __init numa_add_memblk(int nid, u64 start, u64 end) | ||
235 | { | ||
236 | return numa_add_memblk_to(nid, start, end, &numa_meminfo); | ||
237 | } | ||
238 | |||
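
numa_remove_memblk_from() above keeps the blk[] array dense with one memmove over the tail. The same index arithmetic on a toy array (standalone sketch):

#include <stdio.h>
#include <string.h>

int main(void)
{
        int blk[] = { 10, 20, 30, 40 };
        int nr_blks = 4, idx = 1;       /* remove the second entry */

        nr_blks--;
        memmove(&blk[idx], &blk[idx + 1], (nr_blks - idx) * sizeof(blk[0]));

        for (idx = 0; idx < nr_blks; idx++)
                printf("%d ", blk[idx]);        /* prints: 10 30 40 */
        printf("\n");
        return 0;
}
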
191 | /* Initialize bootmem allocator for a node */ | 239 | /* Initialize bootmem allocator for a node */ |
192 | void __init | 240 | void __init |
193 | setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | 241 | setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) |
@@ -234,692 +282,386 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | |||
234 | node_set_online(nodeid); | 282 | node_set_online(nodeid); |
235 | } | 283 | } |
236 | 284 | ||
237 | /* | 285 | /** |
238 | * There are unfortunately some poorly designed mainboards around that | 286 | * numa_cleanup_meminfo - Cleanup a numa_meminfo |
239 | * only connect memory to a single CPU. This breaks the 1:1 cpu->node | 287 | * @mi: numa_meminfo to clean up |
240 | * mapping. To avoid this fill in the mapping for all possible CPUs, | 288 | * |
241 | * as the number of CPUs is not known yet. We round robin the existing | 289 | * Sanitize @mi by merging and removing unnecessary memblks. Also check for |
242 | * nodes. | 290 | * conflicts and clear unused memblks. |
291 | * | ||
292 | * RETURNS: | ||
293 | * 0 on success, -errno on failure. | ||
243 | */ | 294 | */ |
244 | void __init numa_init_array(void) | 295 | int __init numa_cleanup_meminfo(struct numa_meminfo *mi) |
245 | { | 296 | { |
246 | int rr, i; | 297 | const u64 low = 0; |
298 | const u64 high = (u64)max_pfn << PAGE_SHIFT; | ||
299 | int i, j, k; | ||
247 | 300 | ||
248 | rr = first_node(node_online_map); | 301 | for (i = 0; i < mi->nr_blks; i++) { |
249 | for (i = 0; i < nr_cpu_ids; i++) { | 302 | struct numa_memblk *bi = &mi->blk[i]; |
250 | if (early_cpu_to_node(i) != NUMA_NO_NODE) | ||
251 | continue; | ||
252 | numa_set_node(i, rr); | ||
253 | rr = next_node(rr, node_online_map); | ||
254 | if (rr == MAX_NUMNODES) | ||
255 | rr = first_node(node_online_map); | ||
256 | } | ||
257 | } | ||
258 | |||
259 | #ifdef CONFIG_NUMA_EMU | ||
260 | /* Numa emulation */ | ||
261 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | ||
262 | static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; | ||
263 | static char *cmdline __initdata; | ||
264 | 303 | ||
265 | void __init numa_emu_cmdline(char *str) | 304 | /* make sure all blocks are inside the limits */ |
266 | { | 305 | bi->start = max(bi->start, low); |
267 | cmdline = str; | 306 | bi->end = min(bi->end, high); |
268 | } | ||
269 | 307 | ||
270 | static int __init setup_physnodes(unsigned long start, unsigned long end, | 308 | /* and there's no empty block */ |
271 | int acpi, int amd) | 309 | if (bi->start == bi->end) { |
272 | { | 310 | numa_remove_memblk_from(i--, mi); |
273 | int ret = 0; | ||
274 | int i; | ||
275 | |||
276 | memset(physnodes, 0, sizeof(physnodes)); | ||
277 | #ifdef CONFIG_ACPI_NUMA | ||
278 | if (acpi) | ||
279 | acpi_get_nodes(physnodes, start, end); | ||
280 | #endif | ||
281 | #ifdef CONFIG_AMD_NUMA | ||
282 | if (amd) | ||
283 | amd_get_nodes(physnodes); | ||
284 | #endif | ||
285 | /* | ||
286 | * Basic sanity checking on the physical node map: there may be errors | ||
287 | * if the SRAT or AMD code incorrectly reported the topology or the mem= | ||
288 | * kernel parameter is used. | ||
289 | */ | ||
290 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
291 | if (physnodes[i].start == physnodes[i].end) | ||
292 | continue; | ||
293 | if (physnodes[i].start > end) { | ||
294 | physnodes[i].end = physnodes[i].start; | ||
295 | continue; | ||
296 | } | ||
297 | if (physnodes[i].end < start) { | ||
298 | physnodes[i].start = physnodes[i].end; | ||
299 | continue; | 311 | continue; |
300 | } | 312 | } |
301 | if (physnodes[i].start < start) | ||
302 | physnodes[i].start = start; | ||
303 | if (physnodes[i].end > end) | ||
304 | physnodes[i].end = end; | ||
305 | ret++; | ||
306 | } | ||
307 | 313 | ||
308 | /* | 314 | for (j = i + 1; j < mi->nr_blks; j++) { |
309 | * If no physical topology was detected, a single node is faked to cover | 315 | struct numa_memblk *bj = &mi->blk[j]; |
310 | * the entire address space. | 316 | unsigned long start, end; |
311 | */ | ||
312 | if (!ret) { | ||
313 | physnodes[ret].start = start; | ||
314 | physnodes[ret].end = end; | ||
315 | ret = 1; | ||
316 | } | ||
317 | return ret; | ||
318 | } | ||
319 | |||
320 | static void __init fake_physnodes(int acpi, int amd, int nr_nodes) | ||
321 | { | ||
322 | int i; | ||
323 | |||
324 | BUG_ON(acpi && amd); | ||
325 | #ifdef CONFIG_ACPI_NUMA | ||
326 | if (acpi) | ||
327 | acpi_fake_nodes(nodes, nr_nodes); | ||
328 | #endif | ||
329 | #ifdef CONFIG_AMD_NUMA | ||
330 | if (amd) | ||
331 | amd_fake_nodes(nodes, nr_nodes); | ||
332 | #endif | ||
333 | if (!acpi && !amd) | ||
334 | for (i = 0; i < nr_cpu_ids; i++) | ||
335 | numa_set_node(i, 0); | ||
336 | } | ||
337 | |||
338 | /* | ||
339 | * Sets up nid to range from addr to addr + size. If the end | ||
340 | * boundary is greater than max_addr, then max_addr is used instead. | ||
341 | * The return value is 0 if there is additional memory left for | ||
342 | * allocation past addr and -1 otherwise. addr is adjusted to be at | ||
343 | * the end of the node. | ||
344 | */ | ||
345 | static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) | ||
346 | { | ||
347 | int ret = 0; | ||
348 | nodes[nid].start = *addr; | ||
349 | *addr += size; | ||
350 | if (*addr >= max_addr) { | ||
351 | *addr = max_addr; | ||
352 | ret = -1; | ||
353 | } | ||
354 | nodes[nid].end = *addr; | ||
355 | node_set(nid, node_possible_map); | ||
356 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, | ||
357 | nodes[nid].start, nodes[nid].end, | ||
358 | (nodes[nid].end - nodes[nid].start) >> 20); | ||
359 | return ret; | ||
360 | } | ||
361 | |||
362 | /* | ||
363 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr | ||
364 | * to max_addr. The return value is the number of nodes allocated. | ||
365 | */ | ||
366 | static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) | ||
367 | { | ||
368 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
369 | u64 size; | ||
370 | int big; | ||
371 | int ret = 0; | ||
372 | int i; | ||
373 | |||
374 | if (nr_nodes <= 0) | ||
375 | return -1; | ||
376 | if (nr_nodes > MAX_NUMNODES) { | ||
377 | pr_info("numa=fake=%d too large, reducing to %d\n", | ||
378 | nr_nodes, MAX_NUMNODES); | ||
379 | nr_nodes = MAX_NUMNODES; | ||
380 | } | ||
381 | |||
382 | size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; | ||
383 | /* | ||
384 | * Calculate the number of big nodes that can be allocated as a result | ||
385 | * of consolidating the remainder. | ||
386 | */ | ||
387 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / | ||
388 | FAKE_NODE_MIN_SIZE; | ||
389 | |||
390 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
391 | if (!size) { | ||
392 | pr_err("Not enough memory for each node. " | ||
393 | "NUMA emulation disabled.\n"); | ||
394 | return -1; | ||
395 | } | ||
396 | |||
397 | for (i = 0; i < MAX_NUMNODES; i++) | ||
398 | if (physnodes[i].start != physnodes[i].end) | ||
399 | node_set(i, physnode_mask); | ||
400 | |||
401 | /* | ||
402 | * Continue to fill physical nodes with fake nodes until there is no | ||
403 | * memory left on any of them. | ||
404 | */ | ||
405 | while (nodes_weight(physnode_mask)) { | ||
406 | for_each_node_mask(i, physnode_mask) { | ||
407 | u64 end = physnodes[i].start + size; | ||
408 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); | ||
409 | |||
410 | if (ret < big) | ||
411 | end += FAKE_NODE_MIN_SIZE; | ||
412 | 317 | ||
413 | /* | 318 | /* |
414 | * Continue to add memory to this fake node if its | 319 | * See whether there are overlapping blocks. Whine |
415 | * non-reserved memory is less than the per-node size. | 320 | * about but allow overlaps of the same nid. They |
321 | * will be merged below. | ||
416 | */ | 322 | */ |
417 | while (end - physnodes[i].start - | 323 | if (bi->end > bj->start && bi->start < bj->end) { |
418 | memblock_x86_hole_size(physnodes[i].start, end) < size) { | 324 | if (bi->nid != bj->nid) { |
419 | end += FAKE_NODE_MIN_SIZE; | 325 | pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", |
420 | if (end > physnodes[i].end) { | 326 | bi->nid, bi->start, bi->end, |
421 | end = physnodes[i].end; | 327 | bj->nid, bj->start, bj->end); |
422 | break; | 328 | return -EINVAL; |
423 | } | 329 | } |
330 | pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", | ||
331 | bi->nid, bi->start, bi->end, | ||
332 | bj->start, bj->end); | ||
424 | } | 333 | } |
425 | 334 | ||
426 | /* | 335 | /* |
427 | * If there won't be at least FAKE_NODE_MIN_SIZE of | 336 | * Join together blocks on the same node, holes |
428 | * non-reserved memory in ZONE_DMA32 for the next node, | 337 | * between which don't overlap with memory on other |
429 | * this one must extend to the boundary. | 338 | * nodes. |
430 | */ | ||
431 | if (end < dma32_end && dma32_end - end - | ||
432 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
433 | end = dma32_end; | ||
434 | |||
435 | /* | ||
436 | * If there won't be enough non-reserved memory for the | ||
437 | * next node, this one must extend to the end of the | ||
438 | * physical node. | ||
439 | */ | 339 | */ |
440 | if (physnodes[i].end - end - | 340 | if (bi->nid != bj->nid) |
441 | memblock_x86_hole_size(end, physnodes[i].end) < size) | 341 | continue; |
442 | end = physnodes[i].end; | 342 | start = max(min(bi->start, bj->start), low); |
443 | 343 | end = min(max(bi->end, bj->end), high); | |
444 | /* | 344 | for (k = 0; k < mi->nr_blks; k++) { |
445 | * Avoid allocating more nodes than requested, which can | 345 | struct numa_memblk *bk = &mi->blk[k]; |
446 | * happen as a result of rounding down each node's size | 346 | |
447 | * to FAKE_NODE_MIN_SIZE. | 347 | if (bi->nid == bk->nid) |
448 | */ | 348 | continue; |
449 | if (nodes_weight(physnode_mask) + ret >= nr_nodes) | 349 | if (start < bk->end && end > bk->start) |
450 | end = physnodes[i].end; | 350 | break; |
451 | 351 | } | |
452 | if (setup_node_range(ret++, &physnodes[i].start, | 352 | if (k < mi->nr_blks) |
453 | end - physnodes[i].start, | 353 | continue; |
454 | physnodes[i].end) < 0) | 354 | printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", |
455 | node_clear(i, physnode_mask); | 355 | bi->nid, bi->start, bi->end, bj->start, bj->end, |
356 | start, end); | ||
357 | bi->start = start; | ||
358 | bi->end = end; | ||
359 | numa_remove_memblk_from(j--, mi); | ||
456 | } | 360 | } |
457 | } | 361 | } |
458 | return ret; | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * Returns the end address of a node so that there is at least `size' amount of | ||
463 | * non-reserved memory or `max_addr' is reached. | ||
464 | */ | ||
465 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | ||
466 | { | ||
467 | u64 end = start + size; | ||
468 | 362 | ||
469 | while (end - start - memblock_x86_hole_size(start, end) < size) { | 363 | for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { |
470 | end += FAKE_NODE_MIN_SIZE; | 364 | mi->blk[i].start = mi->blk[i].end = 0; |
471 | if (end > max_addr) { | 365 | mi->blk[i].nid = NUMA_NO_NODE; |
472 | end = max_addr; | ||
473 | break; | ||
474 | } | ||
475 | } | 366 | } |
476 | return end; | 367 | |
368 | return 0; | ||
477 | } | 369 | } |
478 | 370 | ||
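
numa_cleanup_meminfo() above clamps every block to the usable range, drops empty ones, rejects overlaps between different nodes, and joins same-node blocks whose union collides with no other node. The join step in isolation (standalone sketch with hypothetical addresses; the scan that vetoes a merge when another node owns memory in the gap is omitted):

#include <stdio.h>

struct blk { unsigned long long start, end; int nid; };

int main(void)
{
        struct blk bi = { 0x0ULL,        0x80000000ULL,  0 };   /* node 0, 0-2 GiB */
        struct blk bj = { 0x80000000ULL, 0x100000000ULL, 0 };   /* node 0, 2-4 GiB */
        unsigned long long start, end;

        if (bi.nid == bj.nid) {
                start = bi.start < bj.start ? bi.start : bj.start;
                end   = bi.end   > bj.end   ? bi.end   : bj.end;
                printf("NUMA: Node %d [%llx,%llx) + [%llx,%llx) -> [%llx,%llx)\n",
                       bi.nid, bi.start, bi.end, bj.start, bj.end, start, end);
        }
        return 0;
}
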
479 | /* | 371 | /* |
480 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from | 372 | * Set nodes, which have memory in @mi, in *@nodemask. |
481 | * `addr' to `max_addr'. The return value is the number of nodes allocated. | ||
482 | */ | 373 | */ |
483 | static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) | 374 | static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, |
375 | const struct numa_meminfo *mi) | ||
484 | { | 376 | { |
485 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
486 | u64 min_size; | ||
487 | int ret = 0; | ||
488 | int i; | 377 | int i; |
489 | 378 | ||
490 | if (!size) | 379 | for (i = 0; i < ARRAY_SIZE(mi->blk); i++) |
491 | return -1; | 380 | if (mi->blk[i].start != mi->blk[i].end && |
492 | /* | 381 | mi->blk[i].nid != NUMA_NO_NODE) |
493 | * The limit on emulated nodes is MAX_NUMNODES, so the size per node is | 382 | node_set(mi->blk[i].nid, *nodemask); |
494 | * increased accordingly if the requested size is too small. This | 383 | } |
495 | * creates a uniform distribution of node sizes across the entire | ||
496 | * machine (but not necessarily over physical nodes). | ||
497 | */ | ||
498 | min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / | ||
499 | MAX_NUMNODES; | ||
500 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); | ||
501 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) | ||
502 | min_size = (min_size + FAKE_NODE_MIN_SIZE) & | ||
503 | FAKE_NODE_MIN_HASH_MASK; | ||
504 | if (size < min_size) { | ||
505 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", | ||
506 | size >> 20, min_size >> 20); | ||
507 | size = min_size; | ||
508 | } | ||
509 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
510 | |||
511 | for (i = 0; i < MAX_NUMNODES; i++) | ||
512 | if (physnodes[i].start != physnodes[i].end) | ||
513 | node_set(i, physnode_mask); | ||
514 | /* | ||
515 | * Fill physical nodes with fake nodes of size until there is no memory | ||
516 | * left on any of them. | ||
517 | */ | ||
518 | while (nodes_weight(physnode_mask)) { | ||
519 | for_each_node_mask(i, physnode_mask) { | ||
520 | u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; | ||
521 | u64 end; | ||
522 | |||
523 | end = find_end_of_node(physnodes[i].start, | ||
524 | physnodes[i].end, size); | ||
525 | /* | ||
526 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
527 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
528 | * this one must extend to the boundary. | ||
529 | */ | ||
530 | if (end < dma32_end && dma32_end - end - | ||
531 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
532 | end = dma32_end; | ||
533 | 384 | ||
534 | /* | 385 | /** |
535 | * If there won't be enough non-reserved memory for the | 386 | * numa_reset_distance - Reset NUMA distance table |
536 | * next node, this one must extend to the end of the | 387 | * |
537 | * physical node. | 388 | * The current table is freed. The next numa_set_distance() call will |
538 | */ | 389 | * create a new one. |
539 | if (physnodes[i].end - end - | 390 | */ |
540 | memblock_x86_hole_size(end, physnodes[i].end) < size) | 391 | void __init numa_reset_distance(void) |
541 | end = physnodes[i].end; | 392 | { |
393 | size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); | ||
542 | 394 | ||
543 | /* | 395 | /* numa_distance could be 1LU marking allocation failure, test cnt */ |
544 | * Setup the fake node that will be allocated as bootmem | 396 | if (numa_distance_cnt) |
545 | * later. If setup_node_range() returns non-zero, there | 397 | memblock_x86_free_range(__pa(numa_distance), |
546 | * is no more memory available on this physical node. | 398 | __pa(numa_distance) + size); |
547 | */ | 399 | numa_distance_cnt = 0; |
548 | if (setup_node_range(ret++, &physnodes[i].start, | 400 | numa_distance = NULL; /* enable table creation */ |
549 | end - physnodes[i].start, | ||
550 | physnodes[i].end) < 0) | ||
551 | node_clear(i, physnode_mask); | ||
552 | } | ||
553 | } | ||
554 | return ret; | ||
555 | } | 401 | } |
556 | 402 | ||
557 | /* | 403 | static int __init numa_alloc_distance(void) |
558 | * Sets up the system RAM area from start_pfn to last_pfn according to the | ||
559 | * numa=fake command-line option. | ||
560 | */ | ||
561 | static int __init numa_emulation(unsigned long start_pfn, | ||
562 | unsigned long last_pfn, int acpi, int amd) | ||
563 | { | 404 | { |
564 | u64 addr = start_pfn << PAGE_SHIFT; | 405 | nodemask_t nodes_parsed; |
565 | u64 max_addr = last_pfn << PAGE_SHIFT; | 406 | size_t size; |
566 | int num_nodes; | 407 | int i, j, cnt = 0; |
567 | int i; | 408 | u64 phys; |
568 | 409 | ||
569 | /* | 410 | /* size the new table and allocate it */ |
570 | * If the numa=fake command-line contains a 'M' or 'G', it represents | 411 | nodes_parsed = numa_nodes_parsed; |
571 | * the fixed node size. Otherwise, if it is just a single number N, | 412 | numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); |
572 | * split the system RAM into N fake nodes. | ||
573 | */ | ||
574 | if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { | ||
575 | u64 size; | ||
576 | 413 | ||
577 | size = memparse(cmdline, &cmdline); | 414 | for_each_node_mask(i, nodes_parsed) |
578 | num_nodes = split_nodes_size_interleave(addr, max_addr, size); | 415 | cnt = i; |
579 | } else { | 416 | cnt++; |
580 | unsigned long n; | 417 | size = cnt * cnt * sizeof(numa_distance[0]); |
581 | 418 | ||
582 | n = simple_strtoul(cmdline, NULL, 0); | 419 | phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, |
583 | num_nodes = split_nodes_interleave(addr, max_addr, n); | 420 | size, PAGE_SIZE); |
421 | if (phys == MEMBLOCK_ERROR) { | ||
422 | pr_warning("NUMA: Warning: can't allocate distance table!\n"); | ||
423 | /* don't retry until explicitly reset */ | ||
424 | numa_distance = (void *)1LU; | ||
425 | return -ENOMEM; | ||
584 | } | 426 | } |
427 | memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); | ||
585 | 428 | ||
586 | if (num_nodes < 0) | 429 | numa_distance = __va(phys); |
587 | return num_nodes; | 430 | numa_distance_cnt = cnt; |
588 | memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); | 431 | |
589 | if (memnode_shift < 0) { | 432 | /* fill with the default distances */ |
590 | memnode_shift = 0; | 433 | for (i = 0; i < cnt; i++) |
591 | printk(KERN_ERR "No NUMA hash function found. NUMA emulation " | 434 | for (j = 0; j < cnt; j++) |
592 | "disabled.\n"); | 435 | numa_distance[i * cnt + j] = i == j ? |
593 | return -1; | 436 | LOCAL_DISTANCE : REMOTE_DISTANCE; |
594 | } | 437 | printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); |
595 | 438 | ||
596 | /* | ||
597 | * We need to vacate all active ranges that may have been registered for | ||
598 | * the e820 memory map. | ||
599 | */ | ||
600 | remove_all_active_ranges(); | ||
601 | for_each_node_mask(i, node_possible_map) { | ||
602 | memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | ||
603 | nodes[i].end >> PAGE_SHIFT); | ||
604 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
605 | } | ||
606 | setup_physnodes(addr, max_addr, acpi, amd); | ||
607 | fake_physnodes(acpi, amd, num_nodes); | ||
608 | numa_init_array(); | ||
609 | return 0; | 439 | return 0; |
610 | } | 440 | } |
611 | #endif /* CONFIG_NUMA_EMU */ | ||
612 | 441 | ||
613 | void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, | 442 | /** |
614 | int acpi, int amd) | 443 | * numa_set_distance - Set NUMA distance from one NUMA to another |
444 | * @from: the 'from' node to set distance | ||
445 | * @to: the 'to' node to set distance | ||
446 | * @distance: NUMA distance | ||
447 | * | ||
448 | * Set the distance from node @from to @to to @distance. If distance table | ||
449 | * doesn't exist, one which is large enough to accommodate all the currently | ||
450 | * known nodes will be created. | ||
451 | * | ||
452 | * If such table cannot be allocated, a warning is printed and further | ||
453 | * calls are ignored until the distance table is reset with | ||
454 | * numa_reset_distance(). | ||
455 | * | ||
456 | * If @from or @to is higher than the highest known node at the time of | ||
457 | * table creation or @distance doesn't make sense, the call is ignored. | ||
458 | * This is to allow simplification of specific NUMA config implementations. | ||
459 | */ | ||
460 | void __init numa_set_distance(int from, int to, int distance) | ||
615 | { | 461 | { |
616 | int i; | 462 | if (!numa_distance && numa_alloc_distance() < 0) |
617 | |||
618 | nodes_clear(node_possible_map); | ||
619 | nodes_clear(node_online_map); | ||
620 | |||
621 | #ifdef CONFIG_NUMA_EMU | ||
622 | setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, | ||
623 | acpi, amd); | ||
624 | if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) | ||
625 | return; | 463 | return; |
626 | setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, | ||
627 | acpi, amd); | ||
628 | nodes_clear(node_possible_map); | ||
629 | nodes_clear(node_online_map); | ||
630 | #endif | ||
631 | 464 | ||
632 | #ifdef CONFIG_ACPI_NUMA | 465 | if (from >= numa_distance_cnt || to >= numa_distance_cnt) { |
633 | if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | 466 | printk_once(KERN_DEBUG "NUMA: Debug: distance out of bounds, from=%d to=%d distance=%d\n", |
634 | last_pfn << PAGE_SHIFT)) | 467 | from, to, distance); |
635 | return; | 468 | return; |
636 | nodes_clear(node_possible_map); | 469 | } |
637 | nodes_clear(node_online_map); | ||
638 | #endif | ||
639 | 470 | ||
640 | #ifdef CONFIG_AMD_NUMA | 471 | if ((u8)distance != distance || |
641 | if (!numa_off && amd && !amd_scan_nodes()) | 472 | (from == to && distance != LOCAL_DISTANCE)) { |
473 | pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", | ||
474 | from, to, distance); | ||
642 | return; | 475 | return; |
643 | nodes_clear(node_possible_map); | 476 | } |
644 | nodes_clear(node_online_map); | ||
645 | #endif | ||
646 | printk(KERN_INFO "%s\n", | ||
647 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | ||
648 | 477 | ||
649 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | 478 | numa_distance[from * numa_distance_cnt + to] = distance; |
650 | start_pfn << PAGE_SHIFT, | ||
651 | last_pfn << PAGE_SHIFT); | ||
652 | /* setup dummy node covering all memory */ | ||
653 | memnode_shift = 63; | ||
654 | memnodemap = memnode.embedded_map; | ||
655 | memnodemap[0] = 0; | ||
656 | node_set_online(0); | ||
657 | node_set(0, node_possible_map); | ||
658 | for (i = 0; i < nr_cpu_ids; i++) | ||
659 | numa_set_node(i, 0); | ||
660 | memblock_x86_register_active_regions(0, start_pfn, last_pfn); | ||
661 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); | ||
662 | } | 479 | } |
663 | 480 | ||
664 | unsigned long __init numa_free_all_bootmem(void) | 481 | int __node_distance(int from, int to) |
665 | { | 482 | { |
666 | unsigned long pages = 0; | 483 | if (from >= numa_distance_cnt || to >= numa_distance_cnt) |
667 | int i; | 484 | return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; |
485 | return numa_distance[from * numa_distance_cnt + to]; | ||
486 | } | ||
487 | EXPORT_SYMBOL(__node_distance); | ||
668 | 488 | ||
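
The distance table built above is one flat cnt x cnt byte array indexed as from * cnt + to; numa_set_distance() writes into it and __node_distance() reads from it, falling back to defaults for unknown nodes. A minimal user-space sketch of that bookkeeping, assuming the generic LOCAL_DISTANCE/REMOTE_DISTANCE values of 10/20 (all names here are illustrative, not kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    #define LOCAL_DISTANCE	10
    #define REMOTE_DISTANCE	20

    static unsigned char *dist;
    static int dist_cnt;

    /* mirror of numa_alloc_distance(): flat cnt x cnt table, default-filled */
    static int alloc_distance(int cnt)
    {
    	int i, j;

    	dist = malloc(cnt * cnt);
    	if (!dist)
    		return -1;
    	dist_cnt = cnt;
    	for (i = 0; i < cnt; i++)
    		for (j = 0; j < cnt; j++)
    			dist[i * cnt + j] = i == j ? LOCAL_DISTANCE
    						   : REMOTE_DISTANCE;
    	return 0;
    }

    /* mirror of __node_distance(): out-of-range nodes get the defaults */
    static int node_distance(int from, int to)
    {
    	if (from >= dist_cnt || to >= dist_cnt)
    		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
    	return dist[from * dist_cnt + to];
    }

    int main(void)
    {
    	if (alloc_distance(4))
    		return 1;
    	dist[0 * dist_cnt + 2] = 31;	/* like numa_set_distance(0, 2, 31) */
    	printf("d(0,2)=%d d(2,2)=%d d(5,5)=%d\n",
    	       node_distance(0, 2), node_distance(2, 2), node_distance(5, 5));
    	free(dist);
    	return 0;
    }
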
669 | for_each_online_node(i) | 489 | /* |
670 | pages += free_all_bootmem_node(NODE_DATA(i)); | 490 | * Sanity check to catch more bad NUMA configurations (they are amazingly |
491 | * common). Make sure the nodes cover all memory. | ||
492 | */ | ||
493 | static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) | ||
494 | { | ||
495 | unsigned long numaram, e820ram; | ||
496 | int i; | ||
671 | 497 | ||
672 | pages += free_all_memory_core_early(MAX_NUMNODES); | 498 | numaram = 0; |
499 | for (i = 0; i < mi->nr_blks; i++) { | ||
500 | unsigned long s = mi->blk[i].start >> PAGE_SHIFT; | ||
501 | unsigned long e = mi->blk[i].end >> PAGE_SHIFT; | ||
502 | numaram += e - s; | ||
503 | numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); | ||
504 | if ((long)numaram < 0) | ||
505 | numaram = 0; | ||
506 | } | ||
673 | 507 | ||
674 | return pages; | 508 | e820ram = max_pfn - (memblock_x86_hole_size(0, |
509 | max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); | ||
510 | /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ | ||
511 | if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { | ||
512 | printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", | ||
513 | (numaram << PAGE_SHIFT) >> 20, | ||
514 | (e820ram << PAGE_SHIFT) >> 20); | ||
515 | return false; | ||
516 | } | ||
517 | return true; | ||
675 | } | 518 | } |
676 | 519 | ||
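
The slack constant in the check above, 1 << (20 - PAGE_SHIFT), is one megabyte expressed in pages (256 pages with 4KB pages): the NUMA-described page count may trail the e820 page count by strictly less than that. A hedged sketch of the comparison with made-up page counts:

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12	/* assuming 4KB pages */

    /* true if the NUMA-described pages cover e820 RAM to within 1MB */
    static bool covers(long numa_pages, long e820_pages)
    {
    	return (e820_pages - numa_pages) < (1L << (20 - PAGE_SHIFT));
    }

    int main(void)
    {
    	printf("255 pages short: %d\n", covers(1048576 - 255, 1048576));
    	printf("256 pages short: %d\n", covers(1048576 - 256, 1048576));
    	return 0;
    }
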
677 | #ifdef CONFIG_NUMA | 520 | static int __init numa_register_memblks(struct numa_meminfo *mi) |
678 | |||
679 | static __init int find_near_online_node(int node) | ||
680 | { | 521 | { |
681 | int n, val; | 522 | int i, nid; |
682 | int min_val = INT_MAX; | ||
683 | int best_node = -1; | ||
684 | 523 | ||
685 | for_each_online_node(n) { | 524 | /* Account for nodes with cpus and no memory */ |
686 | val = node_distance(node, n); | 525 | node_possible_map = numa_nodes_parsed; |
526 | numa_nodemask_from_meminfo(&node_possible_map, mi); | ||
527 | if (WARN_ON(nodes_empty(node_possible_map))) | ||
528 | return -EINVAL; | ||
687 | 529 | ||
688 | if (val < min_val) { | 530 | memnode_shift = compute_hash_shift(mi); |
689 | min_val = val; | 531 | if (memnode_shift < 0) { |
690 | best_node = n; | 532 | printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); |
533 | return -EINVAL; | ||
534 | } | ||
535 | |||
536 | for (i = 0; i < mi->nr_blks; i++) | ||
537 | memblock_x86_register_active_regions(mi->blk[i].nid, | ||
538 | mi->blk[i].start >> PAGE_SHIFT, | ||
539 | mi->blk[i].end >> PAGE_SHIFT); | ||
540 | |||
541 | /* for out of order entries */ | ||
542 | sort_node_map(); | ||
543 | if (!numa_meminfo_cover_memory(mi)) | ||
544 | return -EINVAL; | ||
545 | |||
546 | /* Finally register nodes. */ | ||
547 | for_each_node_mask(nid, node_possible_map) { | ||
548 | u64 start = (u64)max_pfn << PAGE_SHIFT; | ||
549 | u64 end = 0; | ||
550 | |||
551 | for (i = 0; i < mi->nr_blks; i++) { | ||
552 | if (nid != mi->blk[i].nid) | ||
553 | continue; | ||
554 | start = min(mi->blk[i].start, start); | ||
555 | end = max(mi->blk[i].end, end); | ||
691 | } | 556 | } |
557 | |||
558 | if (start < end) | ||
559 | setup_node_bootmem(nid, start, end); | ||
692 | } | 560 | } |
693 | 561 | ||
694 | return best_node; | 562 | return 0; |
695 | } | 563 | } |
696 | 564 | ||
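
The final loop above reduces each node's possibly discontiguous memblks to a single [start, end) hull before calling setup_node_bootmem(). That reduction, isolated as a standalone sketch (illustrative types, not the kernel's):

    #include <stdio.h>

    struct blk { unsigned long long start, end; int nid; };

    /* hull of all blocks belonging to nid, as in numa_register_memblks() */
    static int node_hull(const struct blk *b, int nr, int nid,
    		     unsigned long long *start, unsigned long long *end)
    {
    	int i, found = 0;

    	*start = ~0ULL;
    	*end = 0;
    	for (i = 0; i < nr; i++) {
    		if (b[i].nid != nid)
    			continue;
    		if (b[i].start < *start)
    			*start = b[i].start;
    		if (b[i].end > *end)
    			*end = b[i].end;
    		found = 1;
    	}
    	return found;
    }

    int main(void)
    {
    	struct blk b[] = { { 0, 1 << 20, 0 }, { 4 << 20, 8 << 20, 0 },
    			   { 1 << 20, 4 << 20, 1 } };
    	unsigned long long s, e;

    	if (node_hull(b, 3, 0, &s, &e))
    		printf("node0: [%llu, %llu)\n", s, e);
    	return 0;
    }
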
697 | /* | 565 | /** |
698 | * Setup early cpu_to_node. | 566 | * dummy_numa_init - Fallback dummy NUMA init |
699 | * | 567 | * |
700 | * Populate cpu_to_node[] only if x86_cpu_to_apicid[], | 568 | * Used if there's no underlying NUMA architecture, NUMA initialization |
701 | * and apicid_to_node[] tables have valid entries for a CPU. | 569 | * fails, or NUMA is disabled on the command line. |
702 | * This means we skip cpu_to_node[] initialisation for NUMA | ||
703 | * emulation and faking node case (when running a kernel compiled | ||
704 | * for NUMA on a non NUMA box), which is OK as cpu_to_node[] | ||
705 | * is already initialized in a round robin manner at numa_init_array, | ||
706 | * prior to this call, and this initialization is good enough | ||
707 | * for the fake NUMA cases. | ||
708 | * | 570 | * |
709 | * Called before the per_cpu areas are setup. | 571 | * Must online at least one node and add memory blocks that cover all |
572 | * allowed memory. This function must not fail. | ||
710 | */ | 573 | */ |
711 | void __init init_cpu_to_node(void) | 574 | static int __init dummy_numa_init(void) |
712 | { | 575 | { |
713 | int cpu; | 576 | printk(KERN_INFO "%s\n", |
714 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); | 577 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); |
715 | 578 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | |
716 | BUG_ON(cpu_to_apicid == NULL); | 579 | 0LU, max_pfn << PAGE_SHIFT); |
717 | 580 | ||
718 | for_each_possible_cpu(cpu) { | 581 | node_set(0, numa_nodes_parsed); |
719 | int node; | 582 | numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); |
720 | u16 apicid = cpu_to_apicid[cpu]; | ||
721 | 583 | ||
722 | if (apicid == BAD_APICID) | 584 | return 0; |
723 | continue; | ||
724 | node = apicid_to_node[apicid]; | ||
725 | if (node == NUMA_NO_NODE) | ||
726 | continue; | ||
727 | if (!node_online(node)) | ||
728 | node = find_near_online_node(node); | ||
729 | numa_set_node(cpu, node); | ||
730 | } | ||
731 | } | 585 | } |
732 | #endif | ||
733 | 586 | ||
734 | 587 | static int __init numa_init(int (*init_func)(void)) | |
735 | void __cpuinit numa_set_node(int cpu, int node) | ||
736 | { | 588 | { |
737 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); | 589 | int i; |
738 | 590 | int ret; | |
739 | /* early setting, no percpu area yet */ | ||
740 | if (cpu_to_node_map) { | ||
741 | cpu_to_node_map[cpu] = node; | ||
742 | return; | ||
743 | } | ||
744 | |||
745 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | ||
746 | if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { | ||
747 | printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); | ||
748 | dump_stack(); | ||
749 | return; | ||
750 | } | ||
751 | #endif | ||
752 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
753 | 591 | ||
754 | if (node != NUMA_NO_NODE) | 592 | for (i = 0; i < MAX_LOCAL_APIC; i++) |
755 | set_cpu_numa_node(cpu, node); | 593 | set_apicid_to_node(i, NUMA_NO_NODE); |
756 | } | ||
757 | 594 | ||
758 | void __cpuinit numa_clear_node(int cpu) | 595 | nodes_clear(numa_nodes_parsed); |
759 | { | 596 | nodes_clear(node_possible_map); |
760 | numa_set_node(cpu, NUMA_NO_NODE); | 597 | nodes_clear(node_online_map); |
761 | } | 598 | memset(&numa_meminfo, 0, sizeof(numa_meminfo)); |
762 | 599 | remove_all_active_ranges(); | |
763 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | 600 | numa_reset_distance(); |
764 | 601 | ||
765 | #ifndef CONFIG_NUMA_EMU | 602 | ret = init_func(); |
766 | void __cpuinit numa_add_cpu(int cpu) | 603 | if (ret < 0) |
767 | { | 604 | return ret; |
768 | cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | 605 | ret = numa_cleanup_meminfo(&numa_meminfo); |
769 | } | 606 | if (ret < 0) |
607 | return ret; | ||
770 | 608 | ||
771 | void __cpuinit numa_remove_cpu(int cpu) | 609 | numa_emulation(&numa_meminfo, numa_distance_cnt); |
772 | { | ||
773 | cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
774 | } | ||
775 | #else | ||
776 | void __cpuinit numa_add_cpu(int cpu) | ||
777 | { | ||
778 | unsigned long addr; | ||
779 | u16 apicid; | ||
780 | int physnid; | ||
781 | int nid = NUMA_NO_NODE; | ||
782 | 610 | ||
783 | nid = early_cpu_to_node(cpu); | 611 | ret = numa_register_memblks(&numa_meminfo); |
784 | BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); | 612 | if (ret < 0) |
613 | return ret; | ||
785 | 614 | ||
786 | /* | 615 | for (i = 0; i < nr_cpu_ids; i++) { |
787 | * Use the starting address of the emulated node to find which physical | 616 | int nid = early_cpu_to_node(i); |
788 | * node it is allocated on. | ||
789 | */ | ||
790 | addr = node_start_pfn(nid) << PAGE_SHIFT; | ||
791 | for (physnid = 0; physnid < MAX_NUMNODES; physnid++) | ||
792 | if (addr >= physnodes[physnid].start && | ||
793 | addr < physnodes[physnid].end) | ||
794 | break; | ||
795 | 617 | ||
796 | /* | 618 | if (nid == NUMA_NO_NODE) |
797 | * Map the cpu to each emulated node that is allocated on the physical | 619 | continue; |
798 | * node of the cpu's apic id. | 620 | if (!node_online(nid)) |
799 | */ | 621 | numa_clear_node(i); |
800 | for_each_online_node(nid) { | ||
801 | addr = node_start_pfn(nid) << PAGE_SHIFT; | ||
802 | if (addr >= physnodes[physnid].start && | ||
803 | addr < physnodes[physnid].end) | ||
804 | cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); | ||
805 | } | 622 | } |
623 | numa_init_array(); | ||
624 | return 0; | ||
806 | } | 625 | } |
807 | 626 | ||
808 | void __cpuinit numa_remove_cpu(int cpu) | 627 | void __init initmem_init(void) |
809 | { | 628 | { |
810 | int i; | 629 | int ret; |
811 | 630 | ||
812 | for_each_online_node(i) | 631 | if (!numa_off) { |
813 | cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); | 632 | #ifdef CONFIG_ACPI_NUMA |
814 | } | 633 | ret = numa_init(x86_acpi_numa_init); |
815 | #endif /* !CONFIG_NUMA_EMU */ | 634 | if (!ret) |
816 | 635 | return; | |
817 | #else /* CONFIG_DEBUG_PER_CPU_MAPS */ | 636 | #endif |
818 | static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) | 637 | #ifdef CONFIG_AMD_NUMA |
819 | { | 638 | ret = numa_init(amd_numa_init); |
820 | int node = early_cpu_to_node(cpu); | 639 | if (!ret) |
821 | struct cpumask *mask; | 640 | return; |
822 | char buf[64]; | 641 | #endif |
823 | |||
824 | mask = node_to_cpumask_map[node]; | ||
825 | if (!mask) { | ||
826 | pr_err("node_to_cpumask_map[%i] NULL\n", node); | ||
827 | dump_stack(); | ||
828 | return NULL; | ||
829 | } | 642 | } |
830 | 643 | ||
831 | cpulist_scnprintf(buf, sizeof(buf), mask); | 644 | numa_init(dummy_numa_init); |
832 | printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", | ||
833 | enable ? "numa_add_cpu" : "numa_remove_cpu", | ||
834 | cpu, node, buf); | ||
835 | return mask; | ||
836 | } | 645 | } |
837 | 646 | ||
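
initmem_init() now tries each configuration source through the same numa_init() wrapper and falls through to the dummy setup, which must not fail. The control flow, reduced to a user-space sketch (the detector functions are stand-ins, not the kernel's):

    #include <stdio.h>

    static int acpi_init(void)  { return -1; }	/* stand-in: no usable SRAT */
    static int amd_init(void)   { return -1; }	/* stand-in: no AMD NB topology */
    static int dummy_init(void) { puts("faking one node"); return 0; }

    /* each attempt resets all NUMA state, then runs one detector */
    static int numa_init(int (*init_func)(void))
    {
    	/* state reset elided in this sketch */
    	return init_func();
    }

    int main(void)
    {
    	if (!numa_init(acpi_init))
    		return 0;
    	if (!numa_init(amd_init))
    		return 0;
    	return numa_init(dummy_init);	/* cannot fail */
    }
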
838 | /* | 647 | unsigned long __init numa_free_all_bootmem(void) |
839 | * --------- debug versions of the numa functions --------- | ||
840 | */ | ||
841 | #ifndef CONFIG_NUMA_EMU | ||
842 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
843 | { | ||
844 | struct cpumask *mask; | ||
845 | |||
846 | mask = debug_cpumask_set_cpu(cpu, enable); | ||
847 | if (!mask) | ||
848 | return; | ||
849 | |||
850 | if (enable) | ||
851 | cpumask_set_cpu(cpu, mask); | ||
852 | else | ||
853 | cpumask_clear_cpu(cpu, mask); | ||
854 | } | ||
855 | #else | ||
856 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
857 | { | 648 | { |
858 | int node = early_cpu_to_node(cpu); | 649 | unsigned long pages = 0; |
859 | struct cpumask *mask; | ||
860 | int i; | 650 | int i; |
861 | 651 | ||
862 | for_each_online_node(i) { | 652 | for_each_online_node(i) |
863 | unsigned long addr; | 653 | pages += free_all_bootmem_node(NODE_DATA(i)); |
864 | |||
865 | addr = node_start_pfn(i) << PAGE_SHIFT; | ||
866 | if (addr < physnodes[node].start || | ||
867 | addr >= physnodes[node].end) | ||
868 | continue; | ||
869 | mask = debug_cpumask_set_cpu(cpu, enable); | ||
870 | if (!mask) | ||
871 | return; | ||
872 | |||
873 | if (enable) | ||
874 | cpumask_set_cpu(cpu, mask); | ||
875 | else | ||
876 | cpumask_clear_cpu(cpu, mask); | ||
877 | } | ||
878 | } | ||
879 | #endif /* CONFIG_NUMA_EMU */ | ||
880 | 654 | ||
881 | void __cpuinit numa_add_cpu(int cpu) | 655 | pages += free_all_memory_core_early(MAX_NUMNODES); |
882 | { | ||
883 | numa_set_cpumask(cpu, 1); | ||
884 | } | ||
885 | 656 | ||
886 | void __cpuinit numa_remove_cpu(int cpu) | 657 | return pages; |
887 | { | ||
888 | numa_set_cpumask(cpu, 0); | ||
889 | } | 658 | } |
890 | 659 | ||
891 | int __cpu_to_node(int cpu) | 660 | int __cpuinit numa_cpu_node(int cpu) |
892 | { | 661 | { |
893 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) { | 662 | int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); |
894 | printk(KERN_WARNING | ||
895 | "cpu_to_node(%d): usage too early!\n", cpu); | ||
896 | dump_stack(); | ||
897 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
898 | } | ||
899 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
900 | } | ||
901 | EXPORT_SYMBOL(__cpu_to_node); | ||
902 | 663 | ||
903 | /* | 664 | if (apicid != BAD_APICID) |
904 | * Same function as cpu_to_node() but used if called before the | 665 | return __apicid_to_node[apicid]; |
905 | * per_cpu areas are setup. | 666 | return NUMA_NO_NODE; |
906 | */ | ||
907 | int early_cpu_to_node(int cpu) | ||
908 | { | ||
909 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) | ||
910 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
911 | |||
912 | if (!cpu_possible(cpu)) { | ||
913 | printk(KERN_WARNING | ||
914 | "early_cpu_to_node(%d): no per_cpu area!\n", cpu); | ||
915 | dump_stack(); | ||
916 | return NUMA_NO_NODE; | ||
917 | } | ||
918 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
919 | } | 667 | } |
920 | |||
921 | /* | ||
922 | * --------- end of debug versions of the numa functions --------- | ||
923 | */ | ||
924 | |||
925 | #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ | ||
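
numa_cpu_node() above replaces the old per-cpu table walk with a two-step lookup: the boot-time cpu -> apicid array, then the global apicid -> node array. A sketch with small illustrative tables:

    #include <stdio.h>

    #define BAD_APICID	0xffffU
    #define NUMA_NO_NODE	(-1)

    static unsigned short cpu_to_apicid[] = { 0, 2, BAD_APICID, 6 };
    static int apicid_to_node[8] = { 0, -1, 0, -1, -1, -1, 1, -1 };

    static int numa_cpu_node(int cpu)
    {
    	unsigned short apicid = cpu_to_apicid[cpu];

    	if (apicid != BAD_APICID)
    		return apicid_to_node[apicid];
    	return NUMA_NO_NODE;	/* cpu was never enumerated */
    }

    int main(void)
    {
    	printf("cpu1 -> node %d, cpu2 -> node %d\n",
    	       numa_cpu_node(1), numa_cpu_node(2));
    	return 0;
    }
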
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c new file mode 100644 index 000000000000..ad091e4cff17 --- /dev/null +++ b/arch/x86/mm/numa_emulation.c | |||
@@ -0,0 +1,494 @@ | |||
1 | /* | ||
2 | * NUMA emulation | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/errno.h> | ||
6 | #include <linux/topology.h> | ||
7 | #include <linux/memblock.h> | ||
8 | #include <asm/dma.h> | ||
9 | |||
10 | #include "numa_internal.h" | ||
11 | |||
12 | static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; | ||
13 | static char *emu_cmdline __initdata; | ||
14 | |||
15 | void __init numa_emu_cmdline(char *str) | ||
16 | { | ||
17 | emu_cmdline = str; | ||
18 | } | ||
19 | |||
20 | static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) | ||
21 | { | ||
22 | int i; | ||
23 | |||
24 | for (i = 0; i < mi->nr_blks; i++) | ||
25 | if (mi->blk[i].nid == nid) | ||
26 | return i; | ||
27 | return -ENOENT; | ||
28 | } | ||
29 | |||
30 | /* | ||
31 | * Sets up nid to range from @start to @end. The return value is -errno if | ||
32 | * something went wrong, 0 otherwise. | ||
33 | */ | ||
34 | static int __init emu_setup_memblk(struct numa_meminfo *ei, | ||
35 | struct numa_meminfo *pi, | ||
36 | int nid, int phys_blk, u64 size) | ||
37 | { | ||
38 | struct numa_memblk *eb = &ei->blk[ei->nr_blks]; | ||
39 | struct numa_memblk *pb = &pi->blk[phys_blk]; | ||
40 | |||
41 | if (ei->nr_blks >= NR_NODE_MEMBLKS) { | ||
42 | pr_err("NUMA: Too many emulated memblks, failing emulation\n"); | ||
43 | return -EINVAL; | ||
44 | } | ||
45 | |||
46 | ei->nr_blks++; | ||
47 | eb->start = pb->start; | ||
48 | eb->end = pb->start + size; | ||
49 | eb->nid = nid; | ||
50 | |||
51 | if (emu_nid_to_phys[nid] == NUMA_NO_NODE) | ||
52 | emu_nid_to_phys[nid] = pb->nid; | ||
53 | |||
54 | pb->start += size; | ||
55 | if (pb->start >= pb->end) { | ||
56 | WARN_ON_ONCE(pb->start > pb->end); | ||
57 | numa_remove_memblk_from(phys_blk, pi); | ||
58 | } | ||
59 | |||
60 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, | ||
61 | eb->start, eb->end, (eb->end - eb->start) >> 20); | ||
62 | return 0; | ||
63 | } | ||
64 | |||
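
emu_setup_memblk() carves each emulated block off the front of a physical block and shrinks the physical block in place until it empties, at which point the block is dropped from the meminfo. The core move, isolated (illustrative struct, not the kernel's):

    #include <stdio.h>

    struct blk { unsigned long long start, end; int nid; };

    /* take size bytes from the front of *phys as emulated node nid */
    static struct blk carve(struct blk *phys, int nid, unsigned long long size)
    {
    	struct blk emu = { phys->start, phys->start + size, nid };

    	phys->start += size;	/* what remains for later fake nodes */
    	return emu;
    }

    int main(void)
    {
    	struct blk phys = { 0, 64ULL << 20, 0 };
    	struct blk emu = carve(&phys, 3, 32ULL << 20);

    	printf("emu nid %d: [%llu, %llu), phys left: [%llu, %llu)\n",
    	       emu.nid, emu.start, emu.end, phys.start, phys.end);
    	return 0;
    }
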
65 | /* | ||
66 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr | ||
67 | * to max_addr. The return value is 0 on success, -errno otherwise. | ||
68 | */ | ||
69 | static int __init split_nodes_interleave(struct numa_meminfo *ei, | ||
70 | struct numa_meminfo *pi, | ||
71 | u64 addr, u64 max_addr, int nr_nodes) | ||
72 | { | ||
73 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
74 | u64 size; | ||
75 | int big; | ||
76 | int nid = 0; | ||
77 | int i, ret; | ||
78 | |||
79 | if (nr_nodes <= 0) | ||
80 | return -1; | ||
81 | if (nr_nodes > MAX_NUMNODES) { | ||
82 | pr_info("numa=fake=%d too large, reducing to %d\n", | ||
83 | nr_nodes, MAX_NUMNODES); | ||
84 | nr_nodes = MAX_NUMNODES; | ||
85 | } | ||
86 | |||
87 | size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; | ||
88 | /* | ||
89 | * Calculate the number of big nodes that can be allocated as a result | ||
90 | * of consolidating the remainder. | ||
91 | */ | ||
92 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / | ||
93 | FAKE_NODE_MIN_SIZE; | ||
94 | |||
95 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
96 | if (!size) { | ||
97 | pr_err("Not enough memory for each node. " | ||
98 | "NUMA emulation disabled.\n"); | ||
99 | return -1; | ||
100 | } | ||
101 | |||
102 | for (i = 0; i < pi->nr_blks; i++) | ||
103 | node_set(pi->blk[i].nid, physnode_mask); | ||
104 | |||
105 | /* | ||
106 | * Continue to fill physical nodes with fake nodes until there is no | ||
107 | * memory left on any of them. | ||
108 | */ | ||
109 | while (nodes_weight(physnode_mask)) { | ||
110 | for_each_node_mask(i, physnode_mask) { | ||
111 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); | ||
112 | u64 start, limit, end; | ||
113 | int phys_blk; | ||
114 | |||
115 | phys_blk = emu_find_memblk_by_nid(i, pi); | ||
116 | if (phys_blk < 0) { | ||
117 | node_clear(i, physnode_mask); | ||
118 | continue; | ||
119 | } | ||
120 | start = pi->blk[phys_blk].start; | ||
121 | limit = pi->blk[phys_blk].end; | ||
122 | end = start + size; | ||
123 | |||
124 | if (nid < big) | ||
125 | end += FAKE_NODE_MIN_SIZE; | ||
126 | |||
127 | /* | ||
128 | * Continue to add memory to this fake node if its | ||
129 | * non-reserved memory is less than the per-node size. | ||
130 | */ | ||
131 | while (end - start - | ||
132 | memblock_x86_hole_size(start, end) < size) { | ||
133 | end += FAKE_NODE_MIN_SIZE; | ||
134 | if (end > limit) { | ||
135 | end = limit; | ||
136 | break; | ||
137 | } | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
142 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
143 | * this one must extend to the boundary. | ||
144 | */ | ||
145 | if (end < dma32_end && dma32_end - end - | ||
146 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
147 | end = dma32_end; | ||
148 | |||
149 | /* | ||
150 | * If there won't be enough non-reserved memory for the | ||
151 | * next node, this one must extend to the end of the | ||
152 | * physical node. | ||
153 | */ | ||
154 | if (limit - end - | ||
155 | memblock_x86_hole_size(end, limit) < size) | ||
156 | end = limit; | ||
157 | |||
158 | ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, | ||
159 | phys_blk, | ||
160 | min(end, limit) - start); | ||
161 | if (ret < 0) | ||
162 | return ret; | ||
163 | } | ||
164 | } | ||
165 | return 0; | ||
166 | } | ||
167 | |||
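
The `big' computation above distributes the rounding remainder: after dividing usable RAM evenly, the leftover below the node-size grid is consolidated into a few nodes that each grow by one FAKE_NODE_MIN_SIZE step. A worked sketch of the arithmetic, assuming FAKE_NODE_MIN_SIZE is the 32MB value x86 used at the time:

    #include <stdio.h>

    #define FAKE_NODE_MIN_SIZE	(32ULL << 20)
    #define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1))

    int main(void)
    {
    	unsigned long long usable = 1000ULL << 20;	/* 1000MB usable RAM */
    	int nr_nodes = 3;
    	unsigned long long size = usable / nr_nodes;
    	int big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
    		  FAKE_NODE_MIN_SIZE;

    	size &= FAKE_NODE_MIN_HASH_MASK;	/* round down to the grid */
    	printf("%d nodes of %lluMB, %d of them one 32MB step bigger\n",
    	       nr_nodes, size >> 20, big);	/* 3 nodes of 320MB, 1 bigger */
    	return 0;
    }
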
168 | /* | ||
169 | * Returns the end address of a node so that there is at least `size' amount of | ||
170 | * non-reserved memory or `max_addr' is reached. | ||
171 | */ | ||
172 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | ||
173 | { | ||
174 | u64 end = start + size; | ||
175 | |||
176 | while (end - start - memblock_x86_hole_size(start, end) < size) { | ||
177 | end += FAKE_NODE_MIN_SIZE; | ||
178 | if (end > max_addr) { | ||
179 | end = max_addr; | ||
180 | break; | ||
181 | } | ||
182 | } | ||
183 | return end; | ||
184 | } | ||
185 | |||
186 | /* | ||
187 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from | ||
188 | * `addr' to `max_addr'. The return value is 0 on success, -errno otherwise. | ||
189 | */ | ||
190 | static int __init split_nodes_size_interleave(struct numa_meminfo *ei, | ||
191 | struct numa_meminfo *pi, | ||
192 | u64 addr, u64 max_addr, u64 size) | ||
193 | { | ||
194 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
195 | u64 min_size; | ||
196 | int nid = 0; | ||
197 | int i, ret; | ||
198 | |||
199 | if (!size) | ||
200 | return -1; | ||
201 | /* | ||
202 | * The limit on emulated nodes is MAX_NUMNODES, so the size per node is | ||
203 | * increased accordingly if the requested size is too small. This | ||
204 | * creates a uniform distribution of node sizes across the entire | ||
205 | * machine (but not necessarily over physical nodes). | ||
206 | */ | ||
207 | min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / | ||
208 | MAX_NUMNODES; | ||
209 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); | ||
210 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) | ||
211 | min_size = (min_size + FAKE_NODE_MIN_SIZE) & | ||
212 | FAKE_NODE_MIN_HASH_MASK; | ||
213 | if (size < min_size) { | ||
214 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", | ||
215 | size >> 20, min_size >> 20); | ||
216 | size = min_size; | ||
217 | } | ||
218 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
219 | |||
220 | for (i = 0; i < pi->nr_blks; i++) | ||
221 | node_set(pi->blk[i].nid, physnode_mask); | ||
222 | |||
223 | /* | ||
224 | * Fill physical nodes with fake nodes of size until there is no memory | ||
225 | * left on any of them. | ||
226 | */ | ||
227 | while (nodes_weight(physnode_mask)) { | ||
228 | for_each_node_mask(i, physnode_mask) { | ||
229 | u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; | ||
230 | u64 start, limit, end; | ||
231 | int phys_blk; | ||
232 | |||
233 | phys_blk = emu_find_memblk_by_nid(i, pi); | ||
234 | if (phys_blk < 0) { | ||
235 | node_clear(i, physnode_mask); | ||
236 | continue; | ||
237 | } | ||
238 | start = pi->blk[phys_blk].start; | ||
239 | limit = pi->blk[phys_blk].end; | ||
240 | |||
241 | end = find_end_of_node(start, limit, size); | ||
242 | /* | ||
243 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
244 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
245 | * this one must extend to the boundary. | ||
246 | */ | ||
247 | if (end < dma32_end && dma32_end - end - | ||
248 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
249 | end = dma32_end; | ||
250 | |||
251 | /* | ||
252 | * If there won't be enough non-reserved memory for the | ||
253 | * next node, this one must extend to the end of the | ||
254 | * physical node. | ||
255 | */ | ||
256 | if (limit - end - | ||
257 | memblock_x86_hole_size(end, limit) < size) | ||
258 | end = limit; | ||
259 | |||
260 | ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, | ||
261 | phys_blk, | ||
262 | min(end, limit) - start); | ||
263 | if (ret < 0) | ||
264 | return ret; | ||
265 | } | ||
266 | } | ||
267 | return 0; | ||
268 | } | ||
269 | |||
270 | /** | ||
271 | * numa_emulation - Emulate NUMA nodes | ||
272 | * @numa_meminfo: NUMA configuration to massage | ||
273 | * @numa_dist_cnt: The size of the physical NUMA distance table | ||
274 | * | ||
275 | * Emulate NUMA nodes according to the numa=fake kernel parameter. | ||
276 | * @numa_meminfo contains the physical memory configuration and is modified | ||
277 | * to reflect the emulated configuration on success. @numa_dist_cnt is | ||
278 | * used to determine the size of the physical distance table. | ||
279 | * | ||
280 | * On success, the following modifications are made. | ||
281 | * | ||
282 | * - @numa_meminfo is updated to reflect the emulated nodes. | ||
283 | * | ||
284 | * - __apicid_to_node[] is updated such that APIC IDs are mapped to the | ||
285 | * emulated nodes. | ||
286 | * | ||
287 | * - NUMA distance table is rebuilt to represent distances between emulated | ||
288 | * nodes. The distances are derived from the mapping of emulated | ||
289 | * nodes onto physical nodes, so they match the actual distances. | ||
290 | * | ||
291 | * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical | ||
292 | * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). | ||
293 | * | ||
294 | * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with | ||
295 | * identity mapping and no other modification is made. | ||
296 | */ | ||
297 | void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) | ||
298 | { | ||
299 | static struct numa_meminfo ei __initdata; | ||
300 | static struct numa_meminfo pi __initdata; | ||
301 | const u64 max_addr = max_pfn << PAGE_SHIFT; | ||
302 | u8 *phys_dist = NULL; | ||
303 | size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); | ||
304 | int max_emu_nid, dfl_phys_nid; | ||
305 | int i, j, ret; | ||
306 | |||
307 | if (!emu_cmdline) | ||
308 | goto no_emu; | ||
309 | |||
310 | memset(&ei, 0, sizeof(ei)); | ||
311 | pi = *numa_meminfo; | ||
312 | |||
313 | for (i = 0; i < MAX_NUMNODES; i++) | ||
314 | emu_nid_to_phys[i] = NUMA_NO_NODE; | ||
315 | |||
316 | /* | ||
317 | * If the numa=fake command-line contains an 'M' or 'G', it represents | ||
318 | * the fixed node size. Otherwise, if it is just a single number N, | ||
319 | * split the system RAM into N fake nodes. | ||
320 | */ | ||
321 | if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { | ||
322 | u64 size; | ||
323 | |||
324 | size = memparse(emu_cmdline, &emu_cmdline); | ||
325 | ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); | ||
326 | } else { | ||
327 | unsigned long n; | ||
328 | |||
329 | n = simple_strtoul(emu_cmdline, NULL, 0); | ||
330 | ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); | ||
331 | } | ||
332 | |||
333 | if (ret < 0) | ||
334 | goto no_emu; | ||
335 | |||
336 | if (numa_cleanup_meminfo(&ei) < 0) { | ||
337 | pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); | ||
338 | goto no_emu; | ||
339 | } | ||
340 | |||
341 | /* copy the physical distance table */ | ||
342 | if (numa_dist_cnt) { | ||
343 | u64 phys; | ||
344 | |||
345 | phys = memblock_find_in_range(0, | ||
346 | (u64)max_pfn_mapped << PAGE_SHIFT, | ||
347 | phys_size, PAGE_SIZE); | ||
348 | if (phys == MEMBLOCK_ERROR) { | ||
349 | pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); | ||
350 | goto no_emu; | ||
351 | } | ||
352 | memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST"); | ||
353 | phys_dist = __va(phys); | ||
354 | |||
355 | for (i = 0; i < numa_dist_cnt; i++) | ||
356 | for (j = 0; j < numa_dist_cnt; j++) | ||
357 | phys_dist[i * numa_dist_cnt + j] = | ||
358 | node_distance(i, j); | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * Determine the max emulated nid and the default phys nid to use | ||
363 | * for unmapped nodes. | ||
364 | */ | ||
365 | max_emu_nid = 0; | ||
366 | dfl_phys_nid = NUMA_NO_NODE; | ||
367 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { | ||
368 | if (emu_nid_to_phys[i] != NUMA_NO_NODE) { | ||
369 | max_emu_nid = i; | ||
370 | if (dfl_phys_nid == NUMA_NO_NODE) | ||
371 | dfl_phys_nid = emu_nid_to_phys[i]; | ||
372 | } | ||
373 | } | ||
374 | if (dfl_phys_nid == NUMA_NO_NODE) { | ||
375 | pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); | ||
376 | goto no_emu; | ||
377 | } | ||
378 | |||
379 | /* commit */ | ||
380 | *numa_meminfo = ei; | ||
381 | |||
382 | /* | ||
383 | * Transform __apicid_to_node table to use emulated nids by | ||
384 | * reverse-mapping phys_nid. The maps should always exist but fall | ||
385 | * back to zero just in case. | ||
386 | */ | ||
387 | for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { | ||
388 | if (__apicid_to_node[i] == NUMA_NO_NODE) | ||
389 | continue; | ||
390 | for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) | ||
391 | if (__apicid_to_node[i] == emu_nid_to_phys[j]) | ||
392 | break; | ||
393 | __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; | ||
394 | } | ||
395 | |||
396 | /* make sure all emulated nodes are mapped to a physical node */ | ||
397 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) | ||
398 | if (emu_nid_to_phys[i] == NUMA_NO_NODE) | ||
399 | emu_nid_to_phys[i] = dfl_phys_nid; | ||
400 | |||
401 | /* transform distance table */ | ||
402 | numa_reset_distance(); | ||
403 | for (i = 0; i < max_emu_nid + 1; i++) { | ||
404 | for (j = 0; j < max_emu_nid + 1; j++) { | ||
405 | int physi = emu_nid_to_phys[i]; | ||
406 | int physj = emu_nid_to_phys[j]; | ||
407 | int dist; | ||
408 | |||
409 | if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) | ||
410 | dist = physi == physj ? | ||
411 | LOCAL_DISTANCE : REMOTE_DISTANCE; | ||
412 | else | ||
413 | dist = phys_dist[physi * numa_dist_cnt + physj]; | ||
414 | |||
415 | numa_set_distance(i, j, dist); | ||
416 | } | ||
417 | } | ||
418 | |||
419 | /* free the copied physical distance table */ | ||
420 | if (phys_dist) | ||
421 | memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size); | ||
422 | return; | ||
423 | |||
424 | no_emu: | ||
425 | /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ | ||
426 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) | ||
427 | emu_nid_to_phys[i] = i; | ||
428 | } | ||
429 | |||
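
The distance-table transform at the end of numa_emulation() boils down to: the distance between two emulated nodes is the saved physical distance between the physical nodes they were carved from, with 10/20 as the assumed LOCAL/REMOTE fallbacks. Isolated as a sketch:

    #include <stdio.h>

    /* distance of emulated nodes i, j given the saved physical table */
    static int emu_distance(const int *emu_nid_to_phys,
    			const unsigned char *phys_dist, int phys_cnt,
    			int i, int j)
    {
    	int pi = emu_nid_to_phys[i], pj = emu_nid_to_phys[j];

    	if (pi >= phys_cnt || pj >= phys_cnt)
    		return pi == pj ? 10 : 20;
    	return phys_dist[pi * phys_cnt + pj];
    }

    int main(void)
    {
    	int emu_nid_to_phys[] = { 0, 0, 1, 1 };	/* 4 fake nodes on 2 real */
    	unsigned char phys_dist[] = { 10, 21, 21, 10 };

    	/* fake nodes 0 and 1 share physical node 0: distance 10 */
    	printf("d(0,1)=%d d(1,2)=%d\n",
    	       emu_distance(emu_nid_to_phys, phys_dist, 2, 0, 1),
    	       emu_distance(emu_nid_to_phys, phys_dist, 2, 1, 2));
    	return 0;
    }
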
430 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | ||
431 | void __cpuinit numa_add_cpu(int cpu) | ||
432 | { | ||
433 | int physnid, nid; | ||
434 | |||
435 | nid = early_cpu_to_node(cpu); | ||
436 | BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); | ||
437 | |||
438 | physnid = emu_nid_to_phys[nid]; | ||
439 | |||
440 | /* | ||
441 | * Map the cpu to each emulated node that is allocated on the physical | ||
442 | * node of the cpu's apic id. | ||
443 | */ | ||
444 | for_each_online_node(nid) | ||
445 | if (emu_nid_to_phys[nid] == physnid) | ||
446 | cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); | ||
447 | } | ||
448 | |||
449 | void __cpuinit numa_remove_cpu(int cpu) | ||
450 | { | ||
451 | int i; | ||
452 | |||
453 | for_each_online_node(i) | ||
454 | cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); | ||
455 | } | ||
456 | #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
457 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
458 | { | ||
459 | struct cpumask *mask; | ||
460 | int nid, physnid, i; | ||
461 | |||
462 | nid = early_cpu_to_node(cpu); | ||
463 | if (nid == NUMA_NO_NODE) { | ||
464 | /* early_cpu_to_node() already emits a warning and trace */ | ||
465 | return; | ||
466 | } | ||
467 | |||
468 | physnid = emu_nid_to_phys[nid]; | ||
469 | |||
470 | for_each_online_node(i) { | ||
471 | if (emu_nid_to_phys[i] != physnid) | ||
472 | continue; | ||
473 | |||
474 | mask = debug_cpumask_set_cpu(cpu, enable); | ||
475 | if (!mask) | ||
476 | return; | ||
477 | |||
478 | if (enable) | ||
479 | cpumask_set_cpu(cpu, mask); | ||
480 | else | ||
481 | cpumask_clear_cpu(cpu, mask); | ||
482 | } | ||
483 | } | ||
484 | |||
485 | void __cpuinit numa_add_cpu(int cpu) | ||
486 | { | ||
487 | numa_set_cpumask(cpu, 1); | ||
488 | } | ||
489 | |||
490 | void __cpuinit numa_remove_cpu(int cpu) | ||
491 | { | ||
492 | numa_set_cpumask(cpu, 0); | ||
493 | } | ||
494 | #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h new file mode 100644 index 000000000000..ef2d97377d7c --- /dev/null +++ b/arch/x86/mm/numa_internal.h | |||
@@ -0,0 +1,31 @@ | |||
1 | #ifndef __X86_MM_NUMA_INTERNAL_H | ||
2 | #define __X86_MM_NUMA_INTERNAL_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | #include <asm/numa.h> | ||
6 | |||
7 | struct numa_memblk { | ||
8 | u64 start; | ||
9 | u64 end; | ||
10 | int nid; | ||
11 | }; | ||
12 | |||
13 | struct numa_meminfo { | ||
14 | int nr_blks; | ||
15 | struct numa_memblk blk[NR_NODE_MEMBLKS]; | ||
16 | }; | ||
17 | |||
18 | void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); | ||
19 | int __init numa_cleanup_meminfo(struct numa_meminfo *mi); | ||
20 | void __init numa_reset_distance(void); | ||
21 | |||
22 | #ifdef CONFIG_NUMA_EMU | ||
23 | void __init numa_emulation(struct numa_meminfo *numa_meminfo, | ||
24 | int numa_dist_cnt); | ||
25 | #else | ||
26 | static inline void numa_emulation(struct numa_meminfo *numa_meminfo, | ||
27 | int numa_dist_cnt) | ||
28 | { } | ||
29 | #endif | ||
30 | |||
31 | #endif /* __X86_MM_NUMA_INTERNAL_H */ | ||
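
numa_internal.h gives every detector one in/out format: fill a numa_meminfo, then let the core clean and register it. A hedged sketch of what a numa_add_memblk()-style helper does with these structs (capacity and names simplified for illustration):

    #include <stdio.h>

    struct numa_memblk { unsigned long long start, end; int nid; };
    struct numa_meminfo { int nr_blks; struct numa_memblk blk[16]; };

    static int add_memblk(struct numa_meminfo *mi, int nid,
    		      unsigned long long start, unsigned long long end)
    {
    	if (mi->nr_blks >= 16)
    		return -1;	/* table full, as NR_NODE_MEMBLKS bounds it */
    	mi->blk[mi->nr_blks++] = (struct numa_memblk){ start, end, nid };
    	return 0;
    }

    int main(void)
    {
    	struct numa_meminfo mi = { 0 };

    	add_memblk(&mi, 0, 0, 2ULL << 30);
    	add_memblk(&mi, 1, 2ULL << 30, 4ULL << 30);
    	printf("%d blocks recorded\n", mi.nr_blks);
    	return 0;
    }
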
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index ae96e7b8051d..48651c6f657d 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c | |||
@@ -57,7 +57,7 @@ struct node_memory_chunk_s { | |||
57 | static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; | 57 | static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; |
58 | 58 | ||
59 | static int __initdata num_memory_chunks; /* total number of memory chunks */ | 59 | static int __initdata num_memory_chunks; /* total number of memory chunks */ |
60 | static u8 __initdata apicid_to_pxm[MAX_APICID]; | 60 | static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC]; |
61 | 61 | ||
62 | int acpi_numa __initdata; | 62 | int acpi_numa __initdata; |
63 | 63 | ||
@@ -254,8 +254,8 @@ int __init get_memcfg_from_srat(void) | |||
254 | printk(KERN_DEBUG "Number of memory chunks in system = %d\n", | 254 | printk(KERN_DEBUG "Number of memory chunks in system = %d\n", |
255 | num_memory_chunks); | 255 | num_memory_chunks); |
256 | 256 | ||
257 | for (i = 0; i < MAX_APICID; i++) | 257 | for (i = 0; i < MAX_LOCAL_APIC; i++) |
258 | apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); | 258 | set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i])); |
259 | 259 | ||
260 | for (j = 0; j < num_memory_chunks; j++){ | 260 | for (j = 0; j < num_memory_chunks; j++){ |
261 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; | 261 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; |
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 603d285d1daa..8e9d3394f6d4 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c | |||
@@ -26,88 +26,34 @@ | |||
26 | 26 | ||
27 | int acpi_numa __initdata; | 27 | int acpi_numa __initdata; |
28 | 28 | ||
29 | static struct acpi_table_slit *acpi_slit; | ||
30 | |||
31 | static nodemask_t nodes_parsed __initdata; | ||
32 | static nodemask_t cpu_nodes_parsed __initdata; | ||
33 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | ||
34 | static struct bootnode nodes_add[MAX_NUMNODES]; | 29 | static struct bootnode nodes_add[MAX_NUMNODES]; |
35 | 30 | ||
36 | static int num_node_memblks __initdata; | ||
37 | static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; | ||
38 | static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; | ||
39 | |||
40 | static __init int setup_node(int pxm) | 31 | static __init int setup_node(int pxm) |
41 | { | 32 | { |
42 | return acpi_map_pxm_to_node(pxm); | 33 | return acpi_map_pxm_to_node(pxm); |
43 | } | 34 | } |
44 | 35 | ||
45 | static __init int conflicting_memblks(unsigned long start, unsigned long end) | ||
46 | { | ||
47 | int i; | ||
48 | for (i = 0; i < num_node_memblks; i++) { | ||
49 | struct bootnode *nd = &node_memblk_range[i]; | ||
50 | if (nd->start == nd->end) | ||
51 | continue; | ||
52 | if (nd->end > start && nd->start < end) | ||
53 | return memblk_nodeid[i]; | ||
54 | if (nd->end == end && nd->start == start) | ||
55 | return memblk_nodeid[i]; | ||
56 | } | ||
57 | return -1; | ||
58 | } | ||
59 | |||
60 | static __init void cutoff_node(int i, unsigned long start, unsigned long end) | ||
61 | { | ||
62 | struct bootnode *nd = &nodes[i]; | ||
63 | |||
64 | if (nd->start < start) { | ||
65 | nd->start = start; | ||
66 | if (nd->end < nd->start) | ||
67 | nd->start = nd->end; | ||
68 | } | ||
69 | if (nd->end > end) { | ||
70 | nd->end = end; | ||
71 | if (nd->start > nd->end) | ||
72 | nd->start = nd->end; | ||
73 | } | ||
74 | } | ||
75 | |||
76 | static __init void bad_srat(void) | 36 | static __init void bad_srat(void) |
77 | { | 37 | { |
78 | int i; | ||
79 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | 38 | printk(KERN_ERR "SRAT: SRAT not used.\n"); |
80 | acpi_numa = -1; | 39 | acpi_numa = -1; |
81 | for (i = 0; i < MAX_LOCAL_APIC; i++) | 40 | memset(nodes_add, 0, sizeof(nodes_add)); |
82 | apicid_to_node[i] = NUMA_NO_NODE; | ||
83 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
84 | nodes[i].start = nodes[i].end = 0; | ||
85 | nodes_add[i].start = nodes_add[i].end = 0; | ||
86 | } | ||
87 | remove_all_active_ranges(); | ||
88 | } | 41 | } |
89 | 42 | ||
90 | static __init inline int srat_disabled(void) | 43 | static __init inline int srat_disabled(void) |
91 | { | 44 | { |
92 | return numa_off || acpi_numa < 0; | 45 | return acpi_numa < 0; |
93 | } | 46 | } |
94 | 47 | ||
95 | /* Callback for SLIT parsing */ | 48 | /* Callback for SLIT parsing */ |
96 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | 49 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) |
97 | { | 50 | { |
98 | unsigned length; | 51 | int i, j; |
99 | unsigned long phys; | ||
100 | |||
101 | length = slit->header.length; | ||
102 | phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length, | ||
103 | PAGE_SIZE); | ||
104 | |||
105 | if (phys == MEMBLOCK_ERROR) | ||
106 | panic(" Can not save slit!\n"); | ||
107 | 52 | ||
108 | acpi_slit = __va(phys); | 53 | for (i = 0; i < slit->locality_count; i++) |
109 | memcpy(acpi_slit, slit, length); | 54 | for (j = 0; j < slit->locality_count; j++) |
110 | memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); | 55 | numa_set_distance(pxm_to_node(i), pxm_to_node(j), |
56 | slit->entry[slit->locality_count * i + j]); | ||
111 | } | 57 | } |
112 | 58 | ||
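
The new SLIT callback streams the firmware's flattened locality matrix straight into numa_set_distance() instead of keeping a private copy. A sketch of the row-major indexing it relies on, with pxm_to_node() taken as identity for simplicity:

    #include <stdio.h>

    int main(void)
    {
    	/* a 2x2 SLIT as firmware would flatten it, row-major */
    	unsigned char entry[] = { 10, 21, 21, 10 };
    	int locality_count = 2, i, j;

    	for (i = 0; i < locality_count; i++)
    		for (j = 0; j < locality_count; j++)
    			printf("pxm %d -> pxm %d: distance %d\n",
    			       i, j, entry[locality_count * i + j]);
    	return 0;
    }
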
113 | /* Callback for Proximity Domain -> x2APIC mapping */ | 59 | /* Callback for Proximity Domain -> x2APIC mapping */ |
@@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) | |||
138 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); | 84 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); |
139 | return; | 85 | return; |
140 | } | 86 | } |
141 | apicid_to_node[apic_id] = node; | 87 | set_apicid_to_node(apic_id, node); |
142 | node_set(node, cpu_nodes_parsed); | 88 | node_set(node, numa_nodes_parsed); |
143 | acpi_numa = 1; | 89 | acpi_numa = 1; |
144 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", | 90 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", |
145 | pxm, apic_id, node); | 91 | pxm, apic_id, node); |
@@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) | |||
178 | return; | 124 | return; |
179 | } | 125 | } |
180 | 126 | ||
181 | apicid_to_node[apic_id] = node; | 127 | set_apicid_to_node(apic_id, node); |
182 | node_set(node, cpu_nodes_parsed); | 128 | node_set(node, numa_nodes_parsed); |
183 | acpi_numa = 1; | 129 | acpi_numa = 1; |
184 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", | 130 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", |
185 | pxm, apic_id, node); | 131 | pxm, apic_id, node); |
@@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end) | |||
241 | } | 187 | } |
242 | 188 | ||
243 | if (changed) { | 189 | if (changed) { |
244 | node_set(node, cpu_nodes_parsed); | 190 | node_set(node, numa_nodes_parsed); |
245 | printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", | 191 | printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", |
246 | nd->start, nd->end); | 192 | nd->start, nd->end); |
247 | } | 193 | } |
@@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end) | |||
251 | void __init | 197 | void __init |
252 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | 198 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) |
253 | { | 199 | { |
254 | struct bootnode *nd, oldnode; | ||
255 | unsigned long start, end; | 200 | unsigned long start, end; |
256 | int node, pxm; | 201 | int node, pxm; |
257 | int i; | ||
258 | 202 | ||
259 | if (srat_disabled()) | 203 | if (srat_disabled()) |
260 | return; | 204 | return; |
@@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | |||
276 | bad_srat(); | 220 | bad_srat(); |
277 | return; | 221 | return; |
278 | } | 222 | } |
279 | i = conflicting_memblks(start, end); | 223 | |
280 | if (i == node) { | 224 | if (numa_add_memblk(node, start, end) < 0) { |
281 | printk(KERN_WARNING | ||
282 | "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", | ||
283 | pxm, start, end, nodes[i].start, nodes[i].end); | ||
284 | } else if (i >= 0) { | ||
285 | printk(KERN_ERR | ||
286 | "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", | ||
287 | pxm, start, end, node_to_pxm(i), | ||
288 | nodes[i].start, nodes[i].end); | ||
289 | bad_srat(); | 225 | bad_srat(); |
290 | return; | 226 | return; |
291 | } | 227 | } |
292 | nd = &nodes[node]; | ||
293 | oldnode = *nd; | ||
294 | if (!node_test_and_set(node, nodes_parsed)) { | ||
295 | nd->start = start; | ||
296 | nd->end = end; | ||
297 | } else { | ||
298 | if (start < nd->start) | ||
299 | nd->start = start; | ||
300 | if (nd->end < end) | ||
301 | nd->end = end; | ||
302 | } | ||
303 | 228 | ||
304 | printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, | 229 | printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, |
305 | start, end); | 230 | start, end); |
306 | 231 | ||
307 | if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { | 232 | if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) |
308 | update_nodes_add(node, start, end); | 233 | update_nodes_add(node, start, end); |
309 | /* restore nodes[node] */ | ||
310 | *nd = oldnode; | ||
311 | if ((nd->start | nd->end) == 0) | ||
312 | node_clear(node, nodes_parsed); | ||
313 | } | ||
314 | |||
315 | node_memblk_range[num_node_memblks].start = start; | ||
316 | node_memblk_range[num_node_memblks].end = end; | ||
317 | memblk_nodeid[num_node_memblks] = node; | ||
318 | num_node_memblks++; | ||
319 | } | ||
320 | |||
321 | /* Sanity check to catch more bad SRATs (they are amazingly common). | ||
322 | Make sure the PXMs cover all memory. */ | ||
323 | static int __init nodes_cover_memory(const struct bootnode *nodes) | ||
324 | { | ||
325 | int i; | ||
326 | unsigned long pxmram, e820ram; | ||
327 | |||
328 | pxmram = 0; | ||
329 | for_each_node_mask(i, nodes_parsed) { | ||
330 | unsigned long s = nodes[i].start >> PAGE_SHIFT; | ||
331 | unsigned long e = nodes[i].end >> PAGE_SHIFT; | ||
332 | pxmram += e - s; | ||
333 | pxmram -= __absent_pages_in_range(i, s, e); | ||
334 | if ((long)pxmram < 0) | ||
335 | pxmram = 0; | ||
336 | } | ||
337 | |||
338 | e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); | ||
339 | /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ | ||
340 | if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { | ||
341 | printk(KERN_ERR | ||
342 | "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", | ||
343 | (pxmram << PAGE_SHIFT) >> 20, | ||
344 | (e820ram << PAGE_SHIFT) >> 20); | ||
345 | return 0; | ||
346 | } | ||
347 | return 1; | ||
348 | } | 234 | } |
349 | 235 | ||
350 | void __init acpi_numa_arch_fixup(void) {} | 236 | void __init acpi_numa_arch_fixup(void) {} |
351 | 237 | ||
352 | #ifdef CONFIG_NUMA_EMU | 238 | int __init x86_acpi_numa_init(void) |
353 | void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, | ||
354 | unsigned long end) | ||
355 | { | ||
356 | int i; | ||
357 | |||
358 | for_each_node_mask(i, nodes_parsed) { | ||
359 | cutoff_node(i, start, end); | ||
360 | physnodes[i].start = nodes[i].start; | ||
361 | physnodes[i].end = nodes[i].end; | ||
362 | } | ||
363 | } | ||
364 | #endif /* CONFIG_NUMA_EMU */ | ||
365 | |||
366 | /* Use the information discovered above to actually set up the nodes. */ | ||
367 | int __init acpi_scan_nodes(unsigned long start, unsigned long end) | ||
368 | { | 239 | { |
369 | int i; | 240 | int ret; |
370 | |||
371 | if (acpi_numa <= 0) | ||
372 | return -1; | ||
373 | |||
374 | /* First clean up the node list */ | ||
375 | for (i = 0; i < MAX_NUMNODES; i++) | ||
376 | cutoff_node(i, start, end); | ||
377 | |||
378 | /* | ||
379 | * Join together blocks on the same node, holes between | ||
380 | * which don't overlap with memory on other nodes. | ||
381 | */ | ||
382 | for (i = 0; i < num_node_memblks; ++i) { | ||
383 | int j, k; | ||
384 | |||
385 | for (j = i + 1; j < num_node_memblks; ++j) { | ||
386 | unsigned long start, end; | ||
387 | |||
388 | if (memblk_nodeid[i] != memblk_nodeid[j]) | ||
389 | continue; | ||
390 | start = min(node_memblk_range[i].end, | ||
391 | node_memblk_range[j].end); | ||
392 | end = max(node_memblk_range[i].start, | ||
393 | node_memblk_range[j].start); | ||
394 | for (k = 0; k < num_node_memblks; ++k) { | ||
395 | if (memblk_nodeid[i] == memblk_nodeid[k]) | ||
396 | continue; | ||
397 | if (start < node_memblk_range[k].end && | ||
398 | end > node_memblk_range[k].start) | ||
399 | break; | ||
400 | } | ||
401 | if (k < num_node_memblks) | ||
402 | continue; | ||
403 | start = min(node_memblk_range[i].start, | ||
404 | node_memblk_range[j].start); | ||
405 | end = max(node_memblk_range[i].end, | ||
406 | node_memblk_range[j].end); | ||
407 | printk(KERN_INFO "SRAT: Node %d " | ||
408 | "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", | ||
409 | memblk_nodeid[i], | ||
410 | node_memblk_range[i].start, | ||
411 | node_memblk_range[i].end, | ||
412 | node_memblk_range[j].start, | ||
413 | node_memblk_range[j].end, | ||
414 | start, end); | ||
415 | node_memblk_range[i].start = start; | ||
416 | node_memblk_range[i].end = end; | ||
417 | k = --num_node_memblks - j; | ||
418 | memmove(memblk_nodeid + j, memblk_nodeid + j+1, | ||
419 | k * sizeof(*memblk_nodeid)); | ||
420 | memmove(node_memblk_range + j, node_memblk_range + j+1, | ||
421 | k * sizeof(*node_memblk_range)); | ||
422 | --j; | ||
423 | } | ||
424 | } | ||
425 | |||
426 | memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, | ||
427 | memblk_nodeid); | ||
428 | if (memnode_shift < 0) { | ||
429 | printk(KERN_ERR | ||
430 | "SRAT: No NUMA node hash function found. Contact maintainer\n"); | ||
431 | bad_srat(); | ||
432 | return -1; | ||
433 | } | ||
434 | |||
435 | for (i = 0; i < num_node_memblks; i++) | ||
436 | memblock_x86_register_active_regions(memblk_nodeid[i], | ||
437 | node_memblk_range[i].start >> PAGE_SHIFT, | ||
438 | node_memblk_range[i].end >> PAGE_SHIFT); | ||
439 | |||
440 | /* for out of order entries in SRAT */ | ||
441 | sort_node_map(); | ||
442 | if (!nodes_cover_memory(nodes)) { | ||
443 | bad_srat(); | ||
444 | return -1; | ||
445 | } | ||
446 | 241 | ||
447 | /* Account for nodes with cpus and no memory */ | 242 | ret = acpi_numa_init(); |
448 | nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); | 243 | if (ret < 0) |
449 | 244 | return ret; | |
450 | /* Finally register nodes */ | 245 | return srat_disabled() ? -EINVAL : 0; |
451 | for_each_node_mask(i, node_possible_map) | ||
452 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
453 | /* Try again in case setup_node_bootmem missed one due | ||
454 | to missing bootmem */ | ||
455 | for_each_node_mask(i, node_possible_map) | ||
456 | if (!node_online(i)) | ||
457 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
458 | |||
459 | for (i = 0; i < nr_cpu_ids; i++) { | ||
460 | int node = early_cpu_to_node(i); | ||
461 | |||
462 | if (node == NUMA_NO_NODE) | ||
463 | continue; | ||
464 | if (!node_online(node)) | ||
465 | numa_clear_node(i); | ||
466 | } | ||
467 | numa_init_array(); | ||
468 | return 0; | ||
469 | } | ||
470 | |||
471 | #ifdef CONFIG_NUMA_EMU | ||
472 | static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { | ||
473 | [0 ... MAX_NUMNODES-1] = PXM_INVAL | ||
474 | }; | ||
475 | static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { | ||
476 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
477 | }; | ||
478 | static int __init find_node_by_addr(unsigned long addr) | ||
479 | { | ||
480 | int ret = NUMA_NO_NODE; | ||
481 | int i; | ||
482 | |||
483 | for_each_node_mask(i, nodes_parsed) { | ||
484 | /* | ||
485 | * Find the real node that this emulated node appears on. For | ||
486 | * the sake of simplicity, we only use a real node's starting | ||
487 | * address to determine which emulated node it appears on. | ||
488 | */ | ||
489 | if (addr >= nodes[i].start && addr < nodes[i].end) { | ||
490 | ret = i; | ||
491 | break; | ||
492 | } | ||
493 | } | ||
494 | return ret; | ||
495 | } | 246 | } |
496 | 247 | ||
497 | /* | ||
498 | * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID | ||
499 | * mappings that respect the real ACPI topology but reflect our emulated | ||
500 | * environment. For each emulated node, we find which real node it appears on | ||
501 | * and create PXM to NID mappings for those fake nodes which mirror that | ||
502 | * locality. SLIT will now represent the correct distances between emulated | ||
503 | * nodes as a result of the real topology. | ||
504 | */ | ||
505 | void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) | ||
506 | { | ||
507 | int i, j; | ||
508 | |||
509 | for (i = 0; i < num_nodes; i++) { | ||
510 | int nid, pxm; | ||
511 | |||
512 | nid = find_node_by_addr(fake_nodes[i].start); | ||
513 | if (nid == NUMA_NO_NODE) | ||
514 | continue; | ||
515 | pxm = node_to_pxm(nid); | ||
516 | if (pxm == PXM_INVAL) | ||
517 | continue; | ||
518 | fake_node_to_pxm_map[i] = pxm; | ||
519 | /* | ||
520 | * For each apicid_to_node mapping that exists for this real | ||
521 | * node, it must now point to the fake node ID. | ||
522 | */ | ||
523 | for (j = 0; j < MAX_LOCAL_APIC; j++) | ||
524 | if (apicid_to_node[j] == nid && | ||
525 | fake_apicid_to_node[j] == NUMA_NO_NODE) | ||
526 | fake_apicid_to_node[j] = i; | ||
527 | } | ||
528 | |||
529 | /* | ||
530 | * If there are apicid-to-node mappings for physical nodes that do not | ||
531 | * have a corresponding emulated node, it should default to a guaranteed | ||
532 | * value. | ||
533 | */ | ||
534 | for (i = 0; i < MAX_LOCAL_APIC; i++) | ||
535 | if (apicid_to_node[i] != NUMA_NO_NODE && | ||
536 | fake_apicid_to_node[i] == NUMA_NO_NODE) | ||
537 | fake_apicid_to_node[i] = 0; | ||
538 | |||
539 | for (i = 0; i < num_nodes; i++) | ||
540 | __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); | ||
541 | memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); | ||
542 | |||
543 | nodes_clear(nodes_parsed); | ||
544 | for (i = 0; i < num_nodes; i++) | ||
545 | if (fake_nodes[i].start != fake_nodes[i].end) | ||
546 | node_set(i, nodes_parsed); | ||
547 | } | ||
548 | |||
549 | static int null_slit_node_compare(int a, int b) | ||
550 | { | ||
551 | return node_to_pxm(a) == node_to_pxm(b); | ||
552 | } | ||
553 | #else | ||
554 | static int null_slit_node_compare(int a, int b) | ||
555 | { | ||
556 | return a == b; | ||
557 | } | ||
558 | #endif /* CONFIG_NUMA_EMU */ | ||
559 | |||
560 | int __node_distance(int a, int b) | ||
561 | { | ||
562 | int index; | ||
563 | |||
564 | if (!acpi_slit) | ||
565 | return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : | ||
566 | REMOTE_DISTANCE; | ||
567 | index = acpi_slit->locality_count * node_to_pxm(a); | ||
568 | return acpi_slit->entry[index + node_to_pxm(b)]; | ||
569 | } | ||
570 | |||
571 | EXPORT_SYMBOL(__node_distance); | ||
572 | |||
573 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) | 248 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) |
574 | int memory_add_physaddr_to_nid(u64 start) | 249 | int memory_add_physaddr_to_nid(u64 start) |
575 | { | 250 | { |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6acc724d5d8f..d6c0418c3e47 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, | |||
179 | sender = this_cpu_read(tlb_vector_offset); | 179 | sender = this_cpu_read(tlb_vector_offset); |
180 | f = &flush_state[sender]; | 180 | f = &flush_state[sender]; |
181 | 181 | ||
182 | /* | 182 | if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) |
183 | * Could avoid this lock when | 183 | raw_spin_lock(&f->tlbstate_lock); |
184 | * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is | ||
185 | * probably not worth checking this for a cache-hot lock. | ||
186 | */ | ||
187 | raw_spin_lock(&f->tlbstate_lock); | ||
188 | 184 | ||
189 | f->flush_mm = mm; | 185 | f->flush_mm = mm; |
190 | f->flush_va = va; | 186 | f->flush_va = va; |
@@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, | |||
202 | 198 | ||
203 | f->flush_mm = NULL; | 199 | f->flush_mm = NULL; |
204 | f->flush_va = 0; | 200 | f->flush_va = 0; |
205 | raw_spin_unlock(&f->tlbstate_lock); | 201 | if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) |
202 | raw_spin_unlock(&f->tlbstate_lock); | ||
206 | } | 203 | } |
207 | 204 | ||
208 | void native_flush_tlb_others(const struct cpumask *cpumask, | 205 | void native_flush_tlb_others(const struct cpumask *cpumask, |
@@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
211 | if (is_uv_system()) { | 208 | if (is_uv_system()) { |
212 | unsigned int cpu; | 209 | unsigned int cpu; |
213 | 210 | ||
214 | cpu = get_cpu(); | 211 | cpu = smp_processor_id(); |
215 | cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); | 212 | cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); |
216 | if (cpumask) | 213 | if (cpumask) |
217 | flush_tlb_others_ipi(cpumask, mm, va); | 214 | flush_tlb_others_ipi(cpumask, mm, va); |
218 | put_cpu(); | ||
219 | return; | 215 | return; |
220 | } | 216 | } |
221 | flush_tlb_others_ipi(cpumask, mm, va); | 217 | flush_tlb_others_ipi(cpumask, mm, va); |
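
The tlb.c change takes tlbstate_lock only when there are more possible CPUs than invalidation vectors; otherwise each sender owns its flush_state slot outright and the lock is pure overhead. Since nr_cpu_ids is fixed after boot, the lock and unlock sites always agree. The hunk also drops get_cpu()/put_cpu() in favor of smp_processor_id(), relying on callers already running with preemption disabled. A pthread-based sketch of the conditional-locking shape; the mutex and names stand in for the kernel primitives:

    #include <pthread.h>

    #define NUM_INVALIDATE_TLB_VECTORS 32

    static pthread_mutex_t tlbstate_lock = PTHREAD_MUTEX_INITIALIZER;
    static int nr_cpu_ids = 64;     /* stands in for the possible-CPU count */

    /* Take the per-vector lock only when several CPUs may share a vector. */
    void flush_others(void)
    {
            if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
                    pthread_mutex_lock(&tlbstate_lock);

            /* ... publish flush_mm/flush_va and send the IPI here ... */

            if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
                    pthread_mutex_unlock(&tlbstate_lock);
    }
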
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index e27dffbbb1a7..026e4931d162 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c | |||
@@ -350,7 +350,7 @@ static int __init early_fill_mp_bus_info(void) | |||
350 | 350 | ||
351 | #define ENABLE_CF8_EXT_CFG (1ULL << 46) | 351 | #define ENABLE_CF8_EXT_CFG (1ULL << 46) |
352 | 352 | ||
353 | static void enable_pci_io_ecs(void *unused) | 353 | static void __cpuinit enable_pci_io_ecs(void *unused) |
354 | { | 354 | { |
355 | u64 reg; | 355 | u64 reg; |
356 | rdmsrl(MSR_AMD64_NB_CFG, reg); | 356 | rdmsrl(MSR_AMD64_NB_CFG, reg); |
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 832765c0fb8c..3f6f3347aa17 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -1491,7 +1491,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) | |||
1491 | * early_ioremap fixmap slot, make sure it is RO. | 1491 | * early_ioremap fixmap slot, make sure it is RO. |
1492 | */ | 1492 | */ |
1493 | if (!is_early_ioremap_ptep(ptep) && | 1493 | if (!is_early_ioremap_ptep(ptep) && |
1494 | pfn >= e820_table_start && pfn < e820_table_end) | 1494 | pfn >= pgt_buf_start && pfn < pgt_buf_end) |
1495 | pte = pte_wrprotect(pte); | 1495 | pte = pte_wrprotect(pte); |
1496 | 1496 | ||
1497 | return pte; | 1497 | return pte; |
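
The Xen hunk is a mechanical rename (e820_table_start/end became pgt_buf_start/end); the logic is unchanged: any PTE whose frame backs the early page-table buffer is write-protected, because Xen requires live page tables to be mapped read-only. A tiny sketch of the predicate, with placeholder buffer bounds:

    #include <stdbool.h>

    /* Placeholder bounds of the early page-table buffer, in page frames. */
    static unsigned long pgt_buf_start = 0x100, pgt_buf_end = 0x200;

    /* Models the mask_rw_pte() check: frames that back the page-table
     * buffer must stay read-only so the hypervisor can validate them. */
    bool must_wrprotect(unsigned long pfn, bool is_early_ioremap_pte)
    {
            return !is_early_ioremap_pte &&
                   pfn >= pgt_buf_start && pfn < pgt_buf_end;
    }
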
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 5eb25eb3ea48..3b5c3189fd99 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c | |||
@@ -274,7 +274,7 @@ acpi_table_parse_srat(enum acpi_srat_type id, | |||
274 | 274 | ||
275 | int __init acpi_numa_init(void) | 275 | int __init acpi_numa_init(void) |
276 | { | 276 | { |
277 | int ret = 0; | 277 | int cnt = 0; |
278 | 278 | ||
279 | /* | 279 | /* |
280 | * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= | 280 | * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= |
@@ -288,7 +288,7 @@ int __init acpi_numa_init(void) | |||
288 | acpi_parse_x2apic_affinity, 0); | 288 | acpi_parse_x2apic_affinity, 0); |
289 | acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, | 289 | acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, |
290 | acpi_parse_processor_affinity, 0); | 290 | acpi_parse_processor_affinity, 0); |
291 | ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, | 291 | cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, |
292 | acpi_parse_memory_affinity, | 292 | acpi_parse_memory_affinity, |
293 | NR_NODE_MEMBLKS); | 293 | NR_NODE_MEMBLKS); |
294 | } | 294 | } |
@@ -297,7 +297,10 @@ int __init acpi_numa_init(void) | |||
297 | acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); | 297 | acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); |
298 | 298 | ||
299 | acpi_numa_arch_fixup(); | 299 | acpi_numa_arch_fixup(); |
300 | return ret; | 300 | |
301 | if (cnt <= 0) | ||
302 | return cnt ?: -ENOENT; | ||
303 | return 0; | ||
301 | } | 304 | } |
302 | 305 | ||
303 | int acpi_get_pxm(acpi_handle h) | 306 | int acpi_get_pxm(acpi_handle h) |
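
acpi_numa_init() used to hand back whatever acpi_table_parse_srat() returned for the memory-affinity pass; it now normalizes that into 0 on success, the negative error unchanged, or -ENOENT when the SRAT produced zero memory-affinity entries. The `cnt ?: -ENOENT` form is GCC's binary conditional, which evaluates to cnt when cnt is non-zero and to -ENOENT otherwise. A compilable sketch of the convention (the function name is illustrative):

    #include <errno.h>
    #include <stdio.h>

    /* Models the new return convention: a non-positive parse count folds
     * into an errno-style result. `a ?: b` is GCC's binary conditional:
     * a when a is non-zero, b otherwise. */
    static int numa_init_result(int cnt)
    {
            if (cnt <= 0)
                    return cnt ?: -ENOENT;  /* zero entries -> -ENOENT */
            return 0;                       /* found memory affinity info */
    }

    int main(void)
    {
            printf("%d %d %d\n", numa_init_result(3),
                   numa_init_result(0), numa_init_result(-EINVAL));
            return 0;
    }
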
diff --git a/include/linux/mm.h b/include/linux/mm.h index f6385fc17ad4..679300c050f5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1309,8 +1309,6 @@ int add_from_early_node_map(struct range *range, int az, | |||
1309 | int nr_range, int nid); | 1309 | int nr_range, int nid); |
1310 | u64 __init find_memory_core_early(int nid, u64 size, u64 align, | 1310 | u64 __init find_memory_core_early(int nid, u64 size, u64 align, |
1311 | u64 goal, u64 limit); | 1311 | u64 goal, u64 limit); |
1312 | void *__alloc_memory_core_early(int nodeid, u64 size, u64 align, | ||
1313 | u64 goal, u64 limit); | ||
1314 | typedef int (*work_fn_t)(unsigned long, unsigned long, void *); | 1312 | typedef int (*work_fn_t)(unsigned long, unsigned long, void *); |
1315 | extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); | 1313 | extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); |
1316 | extern void sparse_memory_present_with_active_regions(int nid); | 1314 | extern void sparse_memory_present_with_active_regions(int nid); |
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 3adb06ebf841..580de67f318b 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h | |||
@@ -518,6 +518,7 @@ | |||
518 | #define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303 | 518 | #define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303 |
519 | #define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304 | 519 | #define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304 |
520 | #define PCI_DEVICE_ID_AMD_15H_NB_MISC 0x1603 | 520 | #define PCI_DEVICE_ID_AMD_15H_NB_MISC 0x1603 |
521 | #define PCI_DEVICE_ID_AMD_15H_NB_LINK 0x1604 | ||
521 | #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 | 522 | #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 |
522 | #define PCI_DEVICE_ID_AMD_LANCE 0x2000 | 523 | #define PCI_DEVICE_ID_AMD_LANCE 0x2000 |
523 | #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 | 524 | #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 |
diff --git a/mm/Makefile b/mm/Makefile index 2b1b575ae712..42a8326c3e3d 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | 9 | ||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | maccess.o page_alloc.o page-writeback.o \ | 11 | maccess.o page_alloc.o page-writeback.o \ |
12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
@@ -15,6 +15,12 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | |||
15 | $(mmu-y) | 15 | $(mmu-y) |
16 | obj-y += init-mm.o | 16 | obj-y += init-mm.o |
17 | 17 | ||
18 | ifdef CONFIG_NO_BOOTMEM | ||
19 | obj-y += nobootmem.o | ||
20 | else | ||
21 | obj-y += bootmem.o | ||
22 | endif | ||
23 | |||
18 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 24 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
19 | 25 | ||
20 | obj-$(CONFIG_BOUNCE) += bounce.o | 26 | obj-$(CONFIG_BOUNCE) += bounce.o |
diff --git a/mm/bootmem.c b/mm/bootmem.c index 13b0caa9793c..07aeb89e396e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -23,6 +23,13 @@ | |||
23 | 23 | ||
24 | #include "internal.h" | 24 | #include "internal.h" |
25 | 25 | ||
26 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
27 | struct pglist_data __refdata contig_page_data = { | ||
28 | .bdata = &bootmem_node_data[0] | ||
29 | }; | ||
30 | EXPORT_SYMBOL(contig_page_data); | ||
31 | #endif | ||
32 | |||
26 | unsigned long max_low_pfn; | 33 | unsigned long max_low_pfn; |
27 | unsigned long min_low_pfn; | 34 | unsigned long min_low_pfn; |
28 | unsigned long max_pfn; | 35 | unsigned long max_pfn; |
@@ -35,7 +42,6 @@ unsigned long max_pfn; | |||
35 | unsigned long saved_max_pfn; | 42 | unsigned long saved_max_pfn; |
36 | #endif | 43 | #endif |
37 | 44 | ||
38 | #ifndef CONFIG_NO_BOOTMEM | ||
39 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; | 45 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; |
40 | 46 | ||
41 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); | 47 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); |
@@ -146,7 +152,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
146 | min_low_pfn = start; | 152 | min_low_pfn = start; |
147 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); | 153 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); |
148 | } | 154 | } |
149 | #endif | 155 | |
150 | /* | 156 | /* |
151 | * free_bootmem_late - free bootmem pages directly to page allocator | 157 | * free_bootmem_late - free bootmem pages directly to page allocator |
152 | * @addr: starting address of the range | 158 | * @addr: starting address of the range |
@@ -171,53 +177,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
171 | } | 177 | } |
172 | } | 178 | } |
173 | 179 | ||
174 | #ifdef CONFIG_NO_BOOTMEM | ||
175 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | ||
176 | { | ||
177 | int i; | ||
178 | unsigned long start_aligned, end_aligned; | ||
179 | int order = ilog2(BITS_PER_LONG); | ||
180 | |||
181 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | ||
182 | end_aligned = end & ~(BITS_PER_LONG - 1); | ||
183 | |||
184 | if (end_aligned <= start_aligned) { | ||
185 | for (i = start; i < end; i++) | ||
186 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
187 | |||
188 | return; | ||
189 | } | ||
190 | |||
191 | for (i = start; i < start_aligned; i++) | ||
192 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
193 | |||
194 | for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) | ||
195 | __free_pages_bootmem(pfn_to_page(i), order); | ||
196 | |||
197 | for (i = end_aligned; i < end; i++) | ||
198 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
199 | } | ||
200 | |||
201 | unsigned long __init free_all_memory_core_early(int nodeid) | ||
202 | { | ||
203 | int i; | ||
204 | u64 start, end; | ||
205 | unsigned long count = 0; | ||
206 | struct range *range = NULL; | ||
207 | int nr_range; | ||
208 | |||
209 | nr_range = get_free_all_memory_range(&range, nodeid); | ||
210 | |||
211 | for (i = 0; i < nr_range; i++) { | ||
212 | start = range[i].start; | ||
213 | end = range[i].end; | ||
214 | count += end - start; | ||
215 | __free_pages_memory(start, end); | ||
216 | } | ||
217 | |||
218 | return count; | ||
219 | } | ||
220 | #else | ||
221 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 180 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
222 | { | 181 | { |
223 | int aligned; | 182 | int aligned; |
@@ -278,7 +237,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
278 | 237 | ||
279 | return count; | 238 | return count; |
280 | } | 239 | } |
281 | #endif | ||
282 | 240 | ||
283 | /** | 241 | /** |
284 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 242 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
@@ -289,12 +247,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
289 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 247 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
290 | { | 248 | { |
291 | register_page_bootmem_info_node(pgdat); | 249 | register_page_bootmem_info_node(pgdat); |
292 | #ifdef CONFIG_NO_BOOTMEM | ||
293 | /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ | ||
294 | return 0; | ||
295 | #else | ||
296 | return free_all_bootmem_core(pgdat->bdata); | 250 | return free_all_bootmem_core(pgdat->bdata); |
297 | #endif | ||
298 | } | 251 | } |
299 | 252 | ||
300 | /** | 253 | /** |
@@ -304,16 +257,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | |||
304 | */ | 257 | */ |
305 | unsigned long __init free_all_bootmem(void) | 258 | unsigned long __init free_all_bootmem(void) |
306 | { | 259 | { |
307 | #ifdef CONFIG_NO_BOOTMEM | ||
308 | /* | ||
309 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | ||
310 | * because in some cases, such as when Node0 has no RAM installed, | ||
311 | * low RAM will be on Node1. | ||
312 | * Using MAX_NUMNODES makes sure all ranges in early_node_map[] | ||
313 | * are used, instead of only the Node0-related ones. | ||
314 | */ | ||
315 | return free_all_memory_core_early(MAX_NUMNODES); | ||
316 | #else | ||
317 | unsigned long total_pages = 0; | 260 | unsigned long total_pages = 0; |
318 | bootmem_data_t *bdata; | 261 | bootmem_data_t *bdata; |
319 | 262 | ||
@@ -321,10 +264,8 @@ unsigned long __init free_all_bootmem(void) | |||
321 | total_pages += free_all_bootmem_core(bdata); | 264 | total_pages += free_all_bootmem_core(bdata); |
322 | 265 | ||
323 | return total_pages; | 266 | return total_pages; |
324 | #endif | ||
325 | } | 267 | } |
326 | 268 | ||
327 | #ifndef CONFIG_NO_BOOTMEM | ||
328 | static void __init __free(bootmem_data_t *bdata, | 269 | static void __init __free(bootmem_data_t *bdata, |
329 | unsigned long sidx, unsigned long eidx) | 270 | unsigned long sidx, unsigned long eidx) |
330 | { | 271 | { |
@@ -419,7 +360,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, | |||
419 | } | 360 | } |
420 | BUG(); | 361 | BUG(); |
421 | } | 362 | } |
422 | #endif | ||
423 | 363 | ||
424 | /** | 364 | /** |
425 | * free_bootmem_node - mark a page range as usable | 365 | * free_bootmem_node - mark a page range as usable |
@@ -434,10 +374,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, | |||
434 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 374 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
435 | unsigned long size) | 375 | unsigned long size) |
436 | { | 376 | { |
437 | #ifdef CONFIG_NO_BOOTMEM | ||
438 | kmemleak_free_part(__va(physaddr), size); | ||
439 | memblock_x86_free_range(physaddr, physaddr + size); | ||
440 | #else | ||
441 | unsigned long start, end; | 377 | unsigned long start, end; |
442 | 378 | ||
443 | kmemleak_free_part(__va(physaddr), size); | 379 | kmemleak_free_part(__va(physaddr), size); |
@@ -446,7 +382,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
446 | end = PFN_DOWN(physaddr + size); | 382 | end = PFN_DOWN(physaddr + size); |
447 | 383 | ||
448 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); | 384 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); |
449 | #endif | ||
450 | } | 385 | } |
451 | 386 | ||
452 | /** | 387 | /** |
@@ -460,10 +395,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
460 | */ | 395 | */ |
461 | void __init free_bootmem(unsigned long addr, unsigned long size) | 396 | void __init free_bootmem(unsigned long addr, unsigned long size) |
462 | { | 397 | { |
463 | #ifdef CONFIG_NO_BOOTMEM | ||
464 | kmemleak_free_part(__va(addr), size); | ||
465 | memblock_x86_free_range(addr, addr + size); | ||
466 | #else | ||
467 | unsigned long start, end; | 398 | unsigned long start, end; |
468 | 399 | ||
469 | kmemleak_free_part(__va(addr), size); | 400 | kmemleak_free_part(__va(addr), size); |
@@ -472,7 +403,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
472 | end = PFN_DOWN(addr + size); | 403 | end = PFN_DOWN(addr + size); |
473 | 404 | ||
474 | mark_bootmem(start, end, 0, 0); | 405 | mark_bootmem(start, end, 0, 0); |
475 | #endif | ||
476 | } | 406 | } |
477 | 407 | ||
478 | /** | 408 | /** |
@@ -489,17 +419,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
489 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 419 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
490 | unsigned long size, int flags) | 420 | unsigned long size, int flags) |
491 | { | 421 | { |
492 | #ifdef CONFIG_NO_BOOTMEM | ||
493 | panic("no bootmem"); | ||
494 | return 0; | ||
495 | #else | ||
496 | unsigned long start, end; | 422 | unsigned long start, end; |
497 | 423 | ||
498 | start = PFN_DOWN(physaddr); | 424 | start = PFN_DOWN(physaddr); |
499 | end = PFN_UP(physaddr + size); | 425 | end = PFN_UP(physaddr + size); |
500 | 426 | ||
501 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); | 427 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); |
502 | #endif | ||
503 | } | 428 | } |
504 | 429 | ||
505 | /** | 430 | /** |
@@ -515,20 +440,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
515 | int __init reserve_bootmem(unsigned long addr, unsigned long size, | 440 | int __init reserve_bootmem(unsigned long addr, unsigned long size, |
516 | int flags) | 441 | int flags) |
517 | { | 442 | { |
518 | #ifdef CONFIG_NO_BOOTMEM | ||
519 | panic("no bootmem"); | ||
520 | return 0; | ||
521 | #else | ||
522 | unsigned long start, end; | 443 | unsigned long start, end; |
523 | 444 | ||
524 | start = PFN_DOWN(addr); | 445 | start = PFN_DOWN(addr); |
525 | end = PFN_UP(addr + size); | 446 | end = PFN_UP(addr + size); |
526 | 447 | ||
527 | return mark_bootmem(start, end, 1, flags); | 448 | return mark_bootmem(start, end, 1, flags); |
528 | #endif | ||
529 | } | 449 | } |
530 | 450 | ||
531 | #ifndef CONFIG_NO_BOOTMEM | ||
532 | int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | 451 | int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, |
533 | int flags) | 452 | int flags) |
534 | { | 453 | { |
@@ -685,33 +604,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | |||
685 | #endif | 604 | #endif |
686 | return NULL; | 605 | return NULL; |
687 | } | 606 | } |
688 | #endif | ||
689 | 607 | ||
690 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | 608 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, |
691 | unsigned long align, | 609 | unsigned long align, |
692 | unsigned long goal, | 610 | unsigned long goal, |
693 | unsigned long limit) | 611 | unsigned long limit) |
694 | { | 612 | { |
695 | #ifdef CONFIG_NO_BOOTMEM | ||
696 | void *ptr; | ||
697 | |||
698 | if (WARN_ON_ONCE(slab_is_available())) | ||
699 | return kzalloc(size, GFP_NOWAIT); | ||
700 | |||
701 | restart: | ||
702 | |||
703 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | ||
704 | |||
705 | if (ptr) | ||
706 | return ptr; | ||
707 | |||
708 | if (goal != 0) { | ||
709 | goal = 0; | ||
710 | goto restart; | ||
711 | } | ||
712 | |||
713 | return NULL; | ||
714 | #else | ||
715 | bootmem_data_t *bdata; | 613 | bootmem_data_t *bdata; |
716 | void *region; | 614 | void *region; |
717 | 615 | ||
@@ -737,7 +635,6 @@ restart: | |||
737 | } | 635 | } |
738 | 636 | ||
739 | return NULL; | 637 | return NULL; |
740 | #endif | ||
741 | } | 638 | } |
742 | 639 | ||
743 | /** | 640 | /** |
@@ -758,10 +655,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | |||
758 | { | 655 | { |
759 | unsigned long limit = 0; | 656 | unsigned long limit = 0; |
760 | 657 | ||
761 | #ifdef CONFIG_NO_BOOTMEM | ||
762 | limit = -1UL; | ||
763 | #endif | ||
764 | |||
765 | return ___alloc_bootmem_nopanic(size, align, goal, limit); | 658 | return ___alloc_bootmem_nopanic(size, align, goal, limit); |
766 | } | 659 | } |
767 | 660 | ||
@@ -798,14 +691,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
798 | { | 691 | { |
799 | unsigned long limit = 0; | 692 | unsigned long limit = 0; |
800 | 693 | ||
801 | #ifdef CONFIG_NO_BOOTMEM | ||
802 | limit = -1UL; | ||
803 | #endif | ||
804 | |||
805 | return ___alloc_bootmem(size, align, goal, limit); | 694 | return ___alloc_bootmem(size, align, goal, limit); |
806 | } | 695 | } |
807 | 696 | ||
808 | #ifndef CONFIG_NO_BOOTMEM | ||
809 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | 697 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, |
810 | unsigned long size, unsigned long align, | 698 | unsigned long size, unsigned long align, |
811 | unsigned long goal, unsigned long limit) | 699 | unsigned long goal, unsigned long limit) |
@@ -822,7 +710,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | |||
822 | 710 | ||
823 | return ___alloc_bootmem(size, align, goal, limit); | 711 | return ___alloc_bootmem(size, align, goal, limit); |
824 | } | 712 | } |
825 | #endif | ||
826 | 713 | ||
827 | /** | 714 | /** |
828 | * __alloc_bootmem_node - allocate boot memory from a specific node | 715 | * __alloc_bootmem_node - allocate boot memory from a specific node |
@@ -842,24 +729,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | |||
842 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | 729 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, |
843 | unsigned long align, unsigned long goal) | 730 | unsigned long align, unsigned long goal) |
844 | { | 731 | { |
845 | void *ptr; | ||
846 | |||
847 | if (WARN_ON_ONCE(slab_is_available())) | 732 | if (WARN_ON_ONCE(slab_is_available())) |
848 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 733 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
849 | 734 | ||
850 | #ifdef CONFIG_NO_BOOTMEM | 735 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); |
851 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
852 | goal, -1ULL); | ||
853 | if (ptr) | ||
854 | return ptr; | ||
855 | |||
856 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, | ||
857 | goal, -1ULL); | ||
858 | #else | ||
859 | ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | ||
860 | #endif | ||
861 | |||
862 | return ptr; | ||
863 | } | 736 | } |
864 | 737 | ||
865 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | 738 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, |
@@ -880,13 +753,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
880 | unsigned long new_goal; | 753 | unsigned long new_goal; |
881 | 754 | ||
882 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; | 755 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; |
883 | #ifdef CONFIG_NO_BOOTMEM | ||
884 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
885 | new_goal, -1ULL); | ||
886 | #else | ||
887 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, | 756 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, |
888 | new_goal, 0); | 757 | new_goal, 0); |
889 | #endif | ||
890 | if (ptr) | 758 | if (ptr) |
891 | return ptr; | 759 | return ptr; |
892 | } | 760 | } |
@@ -907,16 +775,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
907 | void * __init alloc_bootmem_section(unsigned long size, | 775 | void * __init alloc_bootmem_section(unsigned long size, |
908 | unsigned long section_nr) | 776 | unsigned long section_nr) |
909 | { | 777 | { |
910 | #ifdef CONFIG_NO_BOOTMEM | ||
911 | unsigned long pfn, goal, limit; | ||
912 | |||
913 | pfn = section_nr_to_pfn(section_nr); | ||
914 | goal = pfn << PAGE_SHIFT; | ||
915 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
916 | |||
917 | return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, | ||
918 | SMP_CACHE_BYTES, goal, limit); | ||
919 | #else | ||
920 | bootmem_data_t *bdata; | 778 | bootmem_data_t *bdata; |
921 | unsigned long pfn, goal, limit; | 779 | unsigned long pfn, goal, limit; |
922 | 780 | ||
@@ -926,7 +784,6 @@ void * __init alloc_bootmem_section(unsigned long size, | |||
926 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | 784 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
927 | 785 | ||
928 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); | 786 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); |
929 | #endif | ||
930 | } | 787 | } |
931 | #endif | 788 | #endif |
932 | 789 | ||
@@ -938,16 +795,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | |||
938 | if (WARN_ON_ONCE(slab_is_available())) | 795 | if (WARN_ON_ONCE(slab_is_available())) |
939 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 796 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
940 | 797 | ||
941 | #ifdef CONFIG_NO_BOOTMEM | ||
942 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
943 | goal, -1ULL); | ||
944 | #else | ||
945 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | 798 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); |
946 | if (ptr) | 799 | if (ptr) |
947 | return ptr; | 800 | return ptr; |
948 | 801 | ||
949 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | 802 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); |
950 | #endif | ||
951 | if (ptr) | 803 | if (ptr) |
952 | return ptr; | 804 | return ptr; |
953 | 805 | ||
@@ -995,21 +847,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | |||
995 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 847 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, |
996 | unsigned long align, unsigned long goal) | 848 | unsigned long align, unsigned long goal) |
997 | { | 849 | { |
998 | void *ptr; | ||
999 | |||
1000 | if (WARN_ON_ONCE(slab_is_available())) | 850 | if (WARN_ON_ONCE(slab_is_available())) |
1001 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 851 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
1002 | 852 | ||
1003 | #ifdef CONFIG_NO_BOOTMEM | 853 | return ___alloc_bootmem_node(pgdat->bdata, size, align, |
1004 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
1005 | goal, ARCH_LOW_ADDRESS_LIMIT); | 854 | goal, ARCH_LOW_ADDRESS_LIMIT); |
1006 | if (ptr) | ||
1007 | return ptr; | ||
1008 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, | ||
1009 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
1010 | #else | ||
1011 | ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, | ||
1012 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
1013 | #endif | ||
1014 | return ptr; | ||
1015 | } | 855 | } |
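
With the CONFIG_NO_BOOTMEM branches gone, bootmem.c is once again purely the bitmap-based allocator, and its PFN-rounding asymmetry is easier to see: free_bootmem() rounds inward (PFN_UP on the start, PFN_DOWN on the end) so partially covered pages stay reserved, while reserve_bootmem() rounds outward so partially covered pages are held. A standalone sketch of the arithmetic, with PAGE_SHIFT fixed at 12 for illustration:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

    int main(void)
    {
            unsigned long addr = 0x1800, size = 0x2800;

            /* free: round inward, partially covered pages stay reserved */
            printf("free    pfns [%lu, %lu)\n", PFN_UP(addr), PFN_DOWN(addr + size));
            /* reserve: round outward, partially covered pages are held  */
            printf("reserve pfns [%lu, %lu)\n", PFN_DOWN(addr), PFN_UP(addr + size));
            return 0;
    }
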
diff --git a/mm/nobootmem.c b/mm/nobootmem.c new file mode 100644 index 000000000000..e2bdb07079ce --- /dev/null +++ b/mm/nobootmem.c | |||
@@ -0,0 +1,435 @@ | |||
1 | /* | ||
2 | * bootmem - A boot-time physical memory allocator and configurator | ||
3 | * | ||
4 | * Copyright (C) 1999 Ingo Molnar | ||
5 | * 1999 Kanoj Sarcar, SGI | ||
6 | * 2008 Johannes Weiner | ||
7 | * | ||
8 | * Access to this subsystem has to be serialized externally (which is true | ||
9 | * for the boot process anyway). | ||
10 | */ | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/pfn.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/bootmem.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/kmemleak.h> | ||
17 | #include <linux/range.h> | ||
18 | #include <linux/memblock.h> | ||
19 | |||
20 | #include <asm/bug.h> | ||
21 | #include <asm/io.h> | ||
22 | #include <asm/processor.h> | ||
23 | |||
24 | #include "internal.h" | ||
25 | |||
26 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
27 | struct pglist_data __refdata contig_page_data; | ||
28 | EXPORT_SYMBOL(contig_page_data); | ||
29 | #endif | ||
30 | |||
31 | unsigned long max_low_pfn; | ||
32 | unsigned long min_low_pfn; | ||
33 | unsigned long max_pfn; | ||
34 | |||
35 | #ifdef CONFIG_CRASH_DUMP | ||
36 | /* | ||
37 | * If we have booted due to a crash, max_pfn will be a very low value. We need | ||
38 | * to know the amount of memory that the previous kernel used. | ||
39 | */ | ||
40 | unsigned long saved_max_pfn; | ||
41 | #endif | ||
42 | |||
43 | static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | ||
44 | u64 goal, u64 limit) | ||
45 | { | ||
46 | void *ptr; | ||
47 | u64 addr; | ||
48 | |||
49 | if (limit > memblock.current_limit) | ||
50 | limit = memblock.current_limit; | ||
51 | |||
52 | addr = find_memory_core_early(nid, size, align, goal, limit); | ||
53 | |||
54 | if (addr == MEMBLOCK_ERROR) | ||
55 | return NULL; | ||
56 | |||
57 | ptr = phys_to_virt(addr); | ||
58 | memset(ptr, 0, size); | ||
59 | memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); | ||
60 | /* | ||
61 | * The min_count is set to 0 so that bootmem allocated blocks | ||
62 | * are never reported as leaks. | ||
63 | */ | ||
64 | kmemleak_alloc(ptr, size, 0, 0); | ||
65 | return ptr; | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * free_bootmem_late - free bootmem pages directly to page allocator | ||
70 | * @addr: starting address of the range | ||
71 | * @size: size of the range in bytes | ||
72 | * | ||
73 | * This is only useful when the bootmem allocator has already been torn | ||
74 | * down, but we are still initializing the system. Pages are given directly | ||
75 | * to the page allocator, no bootmem metadata is updated because it is gone. | ||
76 | */ | ||
77 | void __init free_bootmem_late(unsigned long addr, unsigned long size) | ||
78 | { | ||
79 | unsigned long cursor, end; | ||
80 | |||
81 | kmemleak_free_part(__va(addr), size); | ||
82 | |||
83 | cursor = PFN_UP(addr); | ||
84 | end = PFN_DOWN(addr + size); | ||
85 | |||
86 | for (; cursor < end; cursor++) { | ||
87 | __free_pages_bootmem(pfn_to_page(cursor), 0); | ||
88 | totalram_pages++; | ||
89 | } | ||
90 | } | ||
91 | |||
92 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | ||
93 | { | ||
94 | int i; | ||
95 | unsigned long start_aligned, end_aligned; | ||
96 | int order = ilog2(BITS_PER_LONG); | ||
97 | |||
98 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | ||
99 | end_aligned = end & ~(BITS_PER_LONG - 1); | ||
100 | |||
101 | if (end_aligned <= start_aligned) { | ||
102 | for (i = start; i < end; i++) | ||
103 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
104 | |||
105 | return; | ||
106 | } | ||
107 | |||
108 | for (i = start; i < start_aligned; i++) | ||
109 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
110 | |||
111 | for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) | ||
112 | __free_pages_bootmem(pfn_to_page(i), order); | ||
113 | |||
114 | for (i = end_aligned; i < end; i++) | ||
115 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
116 | } | ||
117 | |||
118 | unsigned long __init free_all_memory_core_early(int nodeid) | ||
119 | { | ||
120 | int i; | ||
121 | u64 start, end; | ||
122 | unsigned long count = 0; | ||
123 | struct range *range = NULL; | ||
124 | int nr_range; | ||
125 | |||
126 | nr_range = get_free_all_memory_range(&range, nodeid); | ||
127 | |||
128 | for (i = 0; i < nr_range; i++) { | ||
129 | start = range[i].start; | ||
130 | end = range[i].end; | ||
131 | count += end - start; | ||
132 | __free_pages_memory(start, end); | ||
133 | } | ||
134 | |||
135 | return count; | ||
136 | } | ||
137 | |||
138 | /** | ||
139 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | ||
140 | * @pgdat: node to be released | ||
141 | * | ||
142 | * Returns the number of pages actually released. | ||
143 | */ | ||
144 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | ||
145 | { | ||
146 | register_page_bootmem_info_node(pgdat); | ||
147 | |||
148 | /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | /** | ||
153 | * free_all_bootmem - release free pages to the buddy allocator | ||
154 | * | ||
155 | * Returns the number of pages actually released. | ||
156 | */ | ||
157 | unsigned long __init free_all_bootmem(void) | ||
158 | { | ||
159 | /* | ||
160 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | ||
161 | * because in some cases, such as when Node0 has no RAM installed, | ||
162 | * low RAM will be on Node1. | ||
163 | * Using MAX_NUMNODES makes sure all ranges in early_node_map[] | ||
164 | * are used, instead of only the Node0-related ones. | ||
165 | */ | ||
166 | return free_all_memory_core_early(MAX_NUMNODES); | ||
167 | } | ||
168 | |||
169 | /** | ||
170 | * free_bootmem_node - mark a page range as usable | ||
171 | * @pgdat: node the range resides on | ||
172 | * @physaddr: starting address of the range | ||
173 | * @size: size of the range in bytes | ||
174 | * | ||
175 | * Partial pages will be considered reserved and left as they are. | ||
176 | * | ||
177 | * The range must reside completely on the specified node. | ||
178 | */ | ||
179 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | ||
180 | unsigned long size) | ||
181 | { | ||
182 | kmemleak_free_part(__va(physaddr), size); | ||
183 | memblock_x86_free_range(physaddr, physaddr + size); | ||
184 | } | ||
185 | |||
186 | /** | ||
187 | * free_bootmem - mark a page range as usable | ||
188 | * @addr: starting address of the range | ||
189 | * @size: size of the range in bytes | ||
190 | * | ||
191 | * Partial pages will be considered reserved and left as they are. | ||
192 | * | ||
193 | * The range must be contiguous but may span node boundaries. | ||
194 | */ | ||
195 | void __init free_bootmem(unsigned long addr, unsigned long size) | ||
196 | { | ||
197 | kmemleak_free_part(__va(addr), size); | ||
198 | memblock_x86_free_range(addr, addr + size); | ||
199 | } | ||
200 | |||
201 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | ||
202 | unsigned long align, | ||
203 | unsigned long goal, | ||
204 | unsigned long limit) | ||
205 | { | ||
206 | void *ptr; | ||
207 | |||
208 | if (WARN_ON_ONCE(slab_is_available())) | ||
209 | return kzalloc(size, GFP_NOWAIT); | ||
210 | |||
211 | restart: | ||
212 | |||
213 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | ||
214 | |||
215 | if (ptr) | ||
216 | return ptr; | ||
217 | |||
218 | if (goal != 0) { | ||
219 | goal = 0; | ||
220 | goto restart; | ||
221 | } | ||
222 | |||
223 | return NULL; | ||
224 | } | ||
225 | |||
226 | /** | ||
227 | * __alloc_bootmem_nopanic - allocate boot memory without panicking | ||
228 | * @size: size of the request in bytes | ||
229 | * @align: alignment of the region | ||
230 | * @goal: preferred starting address of the region | ||
231 | * | ||
232 | * The goal is dropped if it can not be satisfied and the allocation will | ||
233 | * fall back to memory below @goal. | ||
234 | * | ||
235 | * Allocation may happen on any node in the system. | ||
236 | * | ||
237 | * Returns NULL on failure. | ||
238 | */ | ||
239 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | ||
240 | unsigned long goal) | ||
241 | { | ||
242 | unsigned long limit = -1UL; | ||
243 | |||
244 | return ___alloc_bootmem_nopanic(size, align, goal, limit); | ||
245 | } | ||
246 | |||
247 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | ||
248 | unsigned long goal, unsigned long limit) | ||
249 | { | ||
250 | void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); | ||
251 | |||
252 | if (mem) | ||
253 | return mem; | ||
254 | /* | ||
255 | * Whoops, we cannot satisfy the allocation request. | ||
256 | */ | ||
257 | printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); | ||
258 | panic("Out of memory"); | ||
259 | return NULL; | ||
260 | } | ||
261 | |||
262 | /** | ||
263 | * __alloc_bootmem - allocate boot memory | ||
264 | * @size: size of the request in bytes | ||
265 | * @align: alignment of the region | ||
266 | * @goal: preferred starting address of the region | ||
267 | * | ||
268 | * The goal is dropped if it can not be satisfied and the allocation will | ||
269 | * fall back to memory below @goal. | ||
270 | * | ||
271 | * Allocation may happen on any node in the system. | ||
272 | * | ||
273 | * The function panics if the request can not be satisfied. | ||
274 | */ | ||
275 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, | ||
276 | unsigned long goal) | ||
277 | { | ||
278 | unsigned long limit = -1UL; | ||
279 | |||
280 | return ___alloc_bootmem(size, align, goal, limit); | ||
281 | } | ||
282 | |||
283 | /** | ||
284 | * __alloc_bootmem_node - allocate boot memory from a specific node | ||
285 | * @pgdat: node to allocate from | ||
286 | * @size: size of the request in bytes | ||
287 | * @align: alignment of the region | ||
288 | * @goal: preferred starting address of the region | ||
289 | * | ||
290 | * The goal is dropped if it can not be satisfied and the allocation will | ||
291 | * fall back to memory below @goal. | ||
292 | * | ||
293 | * Allocation may fall back to any node in the system if the specified node | ||
294 | * can not hold the requested memory. | ||
295 | * | ||
296 | * The function panics if the request can not be satisfied. | ||
297 | */ | ||
298 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
299 | unsigned long align, unsigned long goal) | ||
300 | { | ||
301 | void *ptr; | ||
302 | |||
303 | if (WARN_ON_ONCE(slab_is_available())) | ||
304 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
305 | |||
306 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
307 | goal, -1ULL); | ||
308 | if (ptr) | ||
309 | return ptr; | ||
310 | |||
311 | return __alloc_memory_core_early(MAX_NUMNODES, size, align, | ||
312 | goal, -1ULL); | ||
313 | } | ||
314 | |||
315 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | ||
316 | unsigned long align, unsigned long goal) | ||
317 | { | ||
318 | #ifdef MAX_DMA32_PFN | ||
319 | unsigned long end_pfn; | ||
320 | |||
321 | if (WARN_ON_ONCE(slab_is_available())) | ||
322 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
323 | |||
324 | /* update goal according to MAX_DMA32_PFN */ | ||
325 | end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
326 | |||
327 | if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && | ||
328 | (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { | ||
329 | void *ptr; | ||
330 | unsigned long new_goal; | ||
331 | |||
332 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; | ||
333 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
334 | new_goal, -1ULL); | ||
335 | if (ptr) | ||
336 | return ptr; | ||
337 | } | ||
338 | #endif | ||
339 | |||
340 | return __alloc_bootmem_node(pgdat, size, align, goal); | ||
341 | |||
342 | } | ||
343 | |||
344 | #ifdef CONFIG_SPARSEMEM | ||
345 | /** | ||
346 | * alloc_bootmem_section - allocate boot memory from a specific section | ||
347 | * @size: size of the request in bytes | ||
348 | * @section_nr: sparse map section to allocate from | ||
349 | * | ||
350 | * Return NULL on failure. | ||
351 | */ | ||
352 | void * __init alloc_bootmem_section(unsigned long size, | ||
353 | unsigned long section_nr) | ||
354 | { | ||
355 | unsigned long pfn, goal, limit; | ||
356 | |||
357 | pfn = section_nr_to_pfn(section_nr); | ||
358 | goal = pfn << PAGE_SHIFT; | ||
359 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
360 | |||
361 | return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, | ||
362 | SMP_CACHE_BYTES, goal, limit); | ||
363 | } | ||
364 | #endif | ||
365 | |||
366 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | ||
367 | unsigned long align, unsigned long goal) | ||
368 | { | ||
369 | void *ptr; | ||
370 | |||
371 | if (WARN_ON_ONCE(slab_is_available())) | ||
372 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
373 | |||
374 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
375 | goal, -1ULL); | ||
376 | if (ptr) | ||
377 | return ptr; | ||
378 | |||
379 | return __alloc_bootmem_nopanic(size, align, goal); | ||
380 | } | ||
381 | |||
382 | #ifndef ARCH_LOW_ADDRESS_LIMIT | ||
383 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | ||
384 | #endif | ||
385 | |||
386 | /** | ||
387 | * __alloc_bootmem_low - allocate low boot memory | ||
388 | * @size: size of the request in bytes | ||
389 | * @align: alignment of the region | ||
390 | * @goal: preferred starting address of the region | ||
391 | * | ||
392 | * The goal is dropped if it can not be satisfied and the allocation will | ||
393 | * fall back to memory below @goal. | ||
394 | * | ||
395 | * Allocation may happen on any node in the system. | ||
396 | * | ||
397 | * The function panics if the request can not be satisfied. | ||
398 | */ | ||
399 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | ||
400 | unsigned long goal) | ||
401 | { | ||
402 | return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); | ||
403 | } | ||
404 | |||
405 | /** | ||
406 | * __alloc_bootmem_low_node - allocate low boot memory from a specific node | ||
407 | * @pgdat: node to allocate from | ||
408 | * @size: size of the request in bytes | ||
409 | * @align: alignment of the region | ||
410 | * @goal: preferred starting address of the region | ||
411 | * | ||
412 | * The goal is dropped if it can not be satisfied and the allocation will | ||
413 | * fall back to memory below @goal. | ||
414 | * | ||
415 | * Allocation may fall back to any node in the system if the specified node | ||
416 | * can not hold the requested memory. | ||
417 | * | ||
418 | * The function panics if the request can not be satisfied. | ||
419 | */ | ||
420 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | ||
421 | unsigned long align, unsigned long goal) | ||
422 | { | ||
423 | void *ptr; | ||
424 | |||
425 | if (WARN_ON_ONCE(slab_is_available())) | ||
426 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
427 | |||
428 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
429 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
430 | if (ptr) | ||
431 | return ptr; | ||
432 | |||
433 | return __alloc_memory_core_early(MAX_NUMNODES, size, align, | ||
434 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
435 | } | ||
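
One detail worth highlighting in the new file: __free_pages_memory() hands a PFN range to the buddy allocator in three passes so the bulk of the range goes in large blocks: the unaligned head page by page, the BITS_PER_LONG-aligned middle in order-ilog2(BITS_PER_LONG) blocks (order 6 on 64-bit), and the unaligned tail page by page. A userspace sketch that just prints the resulting split; the constants and the sample range are illustrative:

    #include <stdio.h>

    #define BITS_PER_LONG 64UL

    /* Prints how __free_pages_memory() would carve up a PFN range:
     * order-0 head and tail around an aligned middle released in
     * order-6 (64-page) blocks on a 64-bit build. */
    static void show_split(unsigned long start, unsigned long end)
    {
            unsigned long sa = (start + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
            unsigned long ea = end & ~(BITS_PER_LONG - 1);

            if (ea <= sa) {         /* too small to hold an aligned block */
                    printf("all order-0: [%lu, %lu)\n", start, end);
                    return;
            }
            printf("head [%lu, %lu) order 0\n", start, sa);
            printf("bulk [%lu, %lu) order 6\n", sa, ea);
            printf("tail [%lu, %lu) order 0\n", ea, end);
    }

    int main(void)
    {
            show_split(10, 300);
            return 0;
    }
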
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cdef1d4b4e47..bd7625676a64 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -3699,13 +3699,45 @@ void __init free_bootmem_with_active_regions(int nid, | |||
3699 | } | 3699 | } |
3700 | 3700 | ||
3701 | #ifdef CONFIG_HAVE_MEMBLOCK | 3701 | #ifdef CONFIG_HAVE_MEMBLOCK |
3702 | /* | ||
3703 | * Basic iterator support. Return the last range of PFNs for a node | ||
3704 | * Note: nid == MAX_NUMNODES returns last region regardless of node | ||
3705 | */ | ||
3706 | static int __meminit last_active_region_index_in_nid(int nid) | ||
3707 | { | ||
3708 | int i; | ||
3709 | |||
3710 | for (i = nr_nodemap_entries - 1; i >= 0; i--) | ||
3711 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) | ||
3712 | return i; | ||
3713 | |||
3714 | return -1; | ||
3715 | } | ||
3716 | |||
3717 | /* | ||
3718 | * Basic iterator support. Return the previous active range of PFNs for a node | ||
3719 | * Note: nid == MAX_NUMNODES returns next region regardless of node | ||
3720 | */ | ||
3721 | static int __meminit previous_active_region_index_in_nid(int index, int nid) | ||
3722 | { | ||
3723 | for (index = index - 1; index >= 0; index--) | ||
3724 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | ||
3725 | return index; | ||
3726 | |||
3727 | return -1; | ||
3728 | } | ||
3729 | |||
3730 | #define for_each_active_range_index_in_nid_reverse(i, nid) \ | ||
3731 | for (i = last_active_region_index_in_nid(nid); i != -1; \ | ||
3732 | i = previous_active_region_index_in_nid(i, nid)) | ||
3733 | |||
3702 | u64 __init find_memory_core_early(int nid, u64 size, u64 align, | 3734 | u64 __init find_memory_core_early(int nid, u64 size, u64 align, |
3703 | u64 goal, u64 limit) | 3735 | u64 goal, u64 limit) |
3704 | { | 3736 | { |
3705 | int i; | 3737 | int i; |
3706 | 3738 | ||
3707 | /* Need to go over early_node_map to find out good range for node */ | 3739 | /* Need to go over early_node_map to find out good range for node */ |
3708 | for_each_active_range_index_in_nid(i, nid) { | 3740 | for_each_active_range_index_in_nid_reverse(i, nid) { |
3709 | u64 addr; | 3741 | u64 addr; |
3710 | u64 ei_start, ei_last; | 3742 | u64 ei_start, ei_last; |
3711 | u64 final_start, final_end; | 3743 | u64 final_start, final_end; |
@@ -3748,34 +3780,6 @@ int __init add_from_early_node_map(struct range *range, int az, | |||
3748 | return nr_range; | 3780 | return nr_range; |
3749 | } | 3781 | } |
3750 | 3782 | ||
3751 | #ifdef CONFIG_NO_BOOTMEM | ||
3752 | void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | ||
3753 | u64 goal, u64 limit) | ||
3754 | { | ||
3755 | void *ptr; | ||
3756 | u64 addr; | ||
3757 | |||
3758 | if (limit > memblock.current_limit) | ||
3759 | limit = memblock.current_limit; | ||
3760 | |||
3761 | addr = find_memory_core_early(nid, size, align, goal, limit); | ||
3762 | |||
3763 | if (addr == MEMBLOCK_ERROR) | ||
3764 | return NULL; | ||
3765 | |||
3766 | ptr = phys_to_virt(addr); | ||
3767 | memset(ptr, 0, size); | ||
3768 | memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); | ||
3769 | /* | ||
3770 | * The min_count is set to 0 so that bootmem allocated blocks | ||
3771 | * are never reported as leaks. | ||
3772 | */ | ||
3773 | kmemleak_alloc(ptr, size, 0, 0); | ||
3774 | return ptr; | ||
3775 | } | ||
3776 | #endif | ||
3777 | |||
3778 | |||
3779 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) | 3783 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) |
3780 | { | 3784 | { |
3781 | int i; | 3785 | int i; |
@@ -4809,15 +4813,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
4809 | dma_reserve = new_dma_reserve; | 4813 | dma_reserve = new_dma_reserve; |
4810 | } | 4814 | } |
4811 | 4815 | ||
4812 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
4813 | struct pglist_data __refdata contig_page_data = { | ||
4814 | #ifndef CONFIG_NO_BOOTMEM | ||
4815 | .bdata = &bootmem_node_data[0] | ||
4816 | #endif | ||
4817 | }; | ||
4818 | EXPORT_SYMBOL(contig_page_data); | ||
4819 | #endif | ||
4820 | |||
4821 | void __init free_area_init(unsigned long *zones_size) | 4816 | void __init free_area_init(unsigned long *zones_size) |
4822 | { | 4817 | { |
4823 | free_area_init_node(0, zones_size, | 4818 | free_area_init_node(0, zones_size, |
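
The new reverse iterator exists so that find_memory_core_early() walks early_node_map[] from the last region backwards, biasing early allocations toward high addresses and away from low memory that other boot code may still need. A self-contained sketch of the backward walk over a made-up three-entry map, filtering by node id:

    #include <stdio.h>

    #define MAX_NUMNODES 64

    struct region { int nid; unsigned long start, end; };

    /* A made-up early_node_map[] with regions on two nodes. */
    static struct region map[] = {
            { 0, 0, 100 }, { 1, 100, 200 }, { 0, 200, 300 },
    };
    static const int nr_entries = 3;

    /* Models previous_active_region_index_in_nid(): step backwards,
     * optionally filtering by node (MAX_NUMNODES matches any node). */
    static int prev_index(int index, int nid)
    {
            for (index = index - 1; index >= 0; index--)
                    if (nid == MAX_NUMNODES || map[index].nid == nid)
                            return index;
            return -1;
    }

    int main(void)
    {
            int i;

            /* Reverse walk over node 0: visits index 2, then index 0. */
            for (i = prev_index(nr_entries, 0); i != -1; i = prev_index(i, 0))
                    printf("region %d: [%lu, %lu)\n", i, map[i].start, map[i].end);
            return 0;
    }
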