Diffstat (limited to 'arch/x86/xen/setup.c')
-rw-r--r--	arch/x86/xen/setup.c	294
1 file changed, 138 insertions, 156 deletions
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 73daaf75801a..f5e1362550e7 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/pm.h>
 #include <linux/memblock.h>
+#include <linux/cpuidle.h>
 
 #include <asm/elf.h>
 #include <asm/vdso.h>
@@ -36,7 +37,10 @@ extern void xen_syscall_target(void);
 extern void xen_syscall32_target(void);
 
 /* Amount of extra memory space we add to the e820 ranges */
-phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+
+/* Number of pages released from the initial allocation. */
+unsigned long xen_released_pages;
 
 /*
  * The maximum amount of extra memory compared to the base size. The
@@ -50,50 +54,47 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
  */
 #define EXTRA_MEM_RATIO		(10)
 
-static void __init xen_add_extra_mem(unsigned long pages)
+static void __init xen_add_extra_mem(u64 start, u64 size)
 {
 	unsigned long pfn;
+	int i;
 
-	u64 size = (u64)pages * PAGE_SIZE;
-	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
-
-	if (!pages)
-		return;
-
-	e820_add_region(extra_start, size, E820_RAM);
-	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-
-	memblock_reserve(extra_start, size);
+	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+		/* Add new region. */
+		if (xen_extra_mem[i].size == 0) {
+			xen_extra_mem[i].start = start;
+			xen_extra_mem[i].size = size;
+			break;
+		}
+		/* Append to existing region. */
+		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
+			xen_extra_mem[i].size += size;
+			break;
+		}
+	}
+	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+		printk(KERN_WARNING "Warning: not enough extra memory regions\n");
 
-	xen_extra_mem_size += size;
+	memblock_reserve(start, size);
 
-	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
+	xen_max_p2m_pfn = PFN_DOWN(start + size);
 
-	for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
+	for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
 		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 }
 
-static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
-					      phys_addr_t end_addr)
+static unsigned long __init xen_release_chunk(unsigned long start,
+					      unsigned long end)
 {
 	struct xen_memory_reservation reservation = {
 		.address_bits = 0,
 		.extent_order = 0,
 		.domid = DOMID_SELF
 	};
-	unsigned long start, end;
 	unsigned long len = 0;
 	unsigned long pfn;
 	int ret;
 
-	start = PFN_UP(start_addr);
-	end = PFN_DOWN(end_addr);
-
-	if (end <= start)
-		return 0;
-
-	printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
-	       start, end);
 	for(pfn = start; pfn < end; pfn++) {
 		unsigned long mfn = pfn_to_mfn(pfn);
 
@@ -106,100 +107,104 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
 
 		ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
 					   &reservation);
-		WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
-		     start, end, ret);
+		WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
 		if (ret == 1) {
 			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 			len++;
 		}
 	}
-	printk(KERN_CONT "%ld pages freed\n", len);
+	printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n",
+	       start, end, len);
 
 	return len;
 }
 
-static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
-						     const struct e820map *e820)
+static unsigned long __init xen_set_identity_and_release(
+	const struct e820entry *list, size_t map_size, unsigned long nr_pages)
 {
-	phys_addr_t max_addr = PFN_PHYS(max_pfn);
-	phys_addr_t last_end = ISA_END_ADDRESS;
+	phys_addr_t start = 0;
 	unsigned long released = 0;
+	unsigned long identity = 0;
+	const struct e820entry *entry;
 	int i;
 
-	/* Free any unused memory above the low 1Mbyte. */
-	for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
-		phys_addr_t end = e820->map[i].addr;
-		end = min(max_addr, end);
+	/*
+	 * Combine non-RAM regions and gaps until a RAM region (or the
+	 * end of the map) is reached, then set the 1:1 map and
+	 * release the pages (if available) in those non-RAM regions.
+	 *
+	 * The combined non-RAM regions are rounded to a whole number
+	 * of pages so any partial pages are accessible via the 1:1
+	 * mapping. This is needed for some BIOSes that put (for
+	 * example) the DMI tables in a reserved region that begins on
+	 * a non-page boundary.
+	 */
+	for (i = 0, entry = list; i < map_size; i++, entry++) {
+		phys_addr_t end = entry->addr + entry->size;
+
+		if (entry->type == E820_RAM || i == map_size - 1) {
+			unsigned long start_pfn = PFN_DOWN(start);
+			unsigned long end_pfn = PFN_UP(end);
 
-		if (last_end < end)
-			released += xen_release_chunk(last_end, end);
-		last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
+			if (entry->type == E820_RAM)
+				end_pfn = PFN_UP(entry->addr);
+
+			if (start_pfn < end_pfn) {
+				if (start_pfn < nr_pages)
+					released += xen_release_chunk(
+						start_pfn, min(end_pfn, nr_pages));
+
+				identity += set_phys_range_identity(
+					start_pfn, end_pfn);
+			}
+			start = end;
+		}
 	}
 
-	if (last_end < max_addr)
-		released += xen_release_chunk(last_end, max_addr);
+	printk(KERN_INFO "Released %lu pages of unused memory\n", released);
+	printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
 
-	printk(KERN_INFO "released %ld pages of unused memory\n", released);
 	return released;
 }
 
-static unsigned long __init xen_set_identity(const struct e820entry *list,
-					     ssize_t map_size)
+static unsigned long __init xen_get_max_pages(void)
 {
-	phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
-	phys_addr_t start_pci = last;
-	const struct e820entry *entry;
-	unsigned long identity = 0;
-	int i;
-
-	for (i = 0, entry = list; i < map_size; i++, entry++) {
-		phys_addr_t start = entry->addr;
-		phys_addr_t end = start + entry->size;
-
-		if (start < last)
-			start = last;
-
-		if (end <= start)
-			continue;
+	unsigned long max_pages = MAX_DOMAIN_PAGES;
+	domid_t domid = DOMID_SELF;
+	int ret;
 
-		/* Skip over the 1MB region. */
-		if (last > end)
-			continue;
+	ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
+	if (ret > 0)
+		max_pages = ret;
+	return min(max_pages, MAX_DOMAIN_PAGES);
+}
 
-		if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
-			if (start > start_pci)
-				identity += set_phys_range_identity(
-						PFN_UP(start_pci), PFN_DOWN(start));
+static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
+{
+	u64 end = start + size;
 
-			/* Without saving 'last' we would gooble RAM too
-			 * at the end of the loop. */
-			last = end;
-			start_pci = end;
-			continue;
-		}
-		start_pci = min(start, start_pci);
-		last = end;
+	/* Align RAM regions to page boundaries. */
+	if (type == E820_RAM) {
+		start = PAGE_ALIGN(start);
+		end &= ~((u64)PAGE_SIZE - 1);
 	}
-	if (last > start_pci)
-		identity += set_phys_range_identity(
-					PFN_UP(start_pci), PFN_DOWN(last));
-	return identity;
+
+	e820_add_region(start, end - start, type);
 }
+
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
 	static struct e820entry map[E820MAX] __initdata;
-	static struct e820entry map_raw[E820MAX] __initdata;
 
 	unsigned long max_pfn = xen_start_info->nr_pages;
 	unsigned long long mem_end;
 	int rc;
 	struct xen_memory_map memmap;
+	unsigned long max_pages;
 	unsigned long extra_pages = 0;
-	unsigned long extra_limit;
-	unsigned long identity_pages = 0;
 	int i;
 	int op;
 
@@ -225,58 +230,65 @@ char * __init xen_memory_setup(void)
 	}
 	BUG_ON(rc);
 
-	memcpy(map_raw, map, sizeof(map));
-	e820.nr_map = 0;
-	xen_extra_mem_start = mem_end;
-	for (i = 0; i < memmap.nr_entries; i++) {
-		unsigned long long end;
-
-		/* Guard against non-page aligned E820 entries. */
-		if (map[i].type == E820_RAM)
-			map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
-
-		end = map[i].addr + map[i].size;
-		if (map[i].type == E820_RAM && end > mem_end) {
-			/* RAM off the end - may be partially included */
-			u64 delta = min(map[i].size, end - mem_end);
-
-			map[i].size -= delta;
-			end -= delta;
-
-			extra_pages += PFN_DOWN(delta);
-			/*
-			 * Set RAM below 4GB that is not for us to be unusable.
-			 * This prevents "System RAM" address space from being
-			 * used as potential resource for I/O address (happens
-			 * when 'allocate_resource' is called).
-			 */
-			if (delta &&
-				(xen_initial_domain() && end < 0x100000000ULL))
-				e820_add_region(end, delta, E820_UNUSABLE);
+	/* Make sure the Xen-supplied memory map is well-ordered. */
+	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+
+	max_pages = xen_get_max_pages();
+	if (max_pages > max_pfn)
+		extra_pages += max_pages - max_pfn;
+
+	/*
+	 * Set P2M for all non-RAM pages and E820 gaps to be identity
+	 * type PFNs. Any RAM pages that would be made inaccesible by
+	 * this are first released.
+	 */
+	xen_released_pages = xen_set_identity_and_release(
+		map, memmap.nr_entries, max_pfn);
+	extra_pages += xen_released_pages;
+
+	/*
+	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
+	 * factor the base size. On non-highmem systems, the base
+	 * size is the full initial memory allocation; on highmem it
+	 * is limited to the max size of lowmem, so that it doesn't
+	 * get completely filled.
+	 *
+	 * In principle there could be a problem in lowmem systems if
+	 * the initial memory is also very large with respect to
+	 * lowmem, but we won't try to deal with that here.
+	 */
+	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+			  extra_pages);
+
+	i = 0;
+	while (i < memmap.nr_entries) {
+		u64 addr = map[i].addr;
+		u64 size = map[i].size;
+		u32 type = map[i].type;
+
+		if (type == E820_RAM) {
+			if (addr < mem_end) {
+				size = min(size, mem_end - addr);
+			} else if (extra_pages) {
+				size = min(size, (u64)extra_pages * PAGE_SIZE);
+				extra_pages -= size / PAGE_SIZE;
+				xen_add_extra_mem(addr, size);
+			} else
+				type = E820_UNUSABLE;
 		}
 
-		if (map[i].size > 0 && end > xen_extra_mem_start)
-			xen_extra_mem_start = end;
+		xen_align_and_add_e820_region(addr, size, type);
 
-		/* Add region if any remains */
-		if (map[i].size > 0)
-			e820_add_region(map[i].addr, map[i].size, map[i].type);
+		map[i].addr += size;
+		map[i].size -= size;
+		if (map[i].size == 0)
+			i++;
 	}
-	/* Align the balloon area so that max_low_pfn does not get set
-	 * to be at the _end_ of the PCI gap at the far end (fee01000).
-	 * Note that xen_extra_mem_start gets set in the loop above to be
-	 * past the last E820 region. */
-	if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
-		xen_extra_mem_start = (1ULL<<32);
 
 	/*
 	 * In domU, the ISA region is normal, usable memory, but we
 	 * reserve ISA memory anyway because too many things poke
 	 * about in there.
-	 *
-	 * In Dom0, the host E820 information can leave gaps in the
-	 * ISA range, which would cause us to release those pages. To
-	 * avoid this, we unconditionally reserve them here.
 	 */
 	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
 			E820_RESERVED);
@@ -292,36 +304,6 @@ char * __init xen_memory_setup(void)
 
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 
-	extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
-
-	/*
-	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
-	 * factor the base size. On non-highmem systems, the base
-	 * size is the full initial memory allocation; on highmem it
-	 * is limited to the max size of lowmem, so that it doesn't
-	 * get completely filled.
-	 *
-	 * In principle there could be a problem in lowmem systems if
-	 * the initial memory is also very large with respect to
-	 * lowmem, but we won't try to deal with that here.
-	 */
-	extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
-			  max_pfn + extra_pages);
-
-	if (extra_limit >= max_pfn)
-		extra_pages = extra_limit - max_pfn;
-	else
-		extra_pages = 0;
-
-	xen_add_extra_mem(extra_pages);
-
-	/*
-	 * Set P2M for all non-RAM pages and E820 gaps to be identity
-	 * type PFNs. We supply it with the non-sanitized version
-	 * of the E820.
-	 */
-	identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
-	printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
 	return "Xen";
 }
 
@@ -425,7 +407,7 @@ void __init xen_arch_setup(void)
 #ifdef CONFIG_X86_32
 	boot_cpu_data.hlt_works_ok = 1;
 #endif
-	pm_idle = default_idle;
+	disable_cpuidle();
 	boot_option_idle_override = IDLE_HALT;
 
 	fiddle_vdso();