aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2012-04-06 10:07:11 -0400
committerKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>2012-05-07 15:31:46 -0400
commit2e2fb75475c2fc74c98100f1468c8195fee49f3b (patch)
treec0ce4180a2efb77456681a0d340371e10129124c /arch/x86
parentca1182387e57470460294ce1e39e2d5518809811 (diff)
xen/setup: Populate freed MFNs from non-RAM E820 entries and gaps to E820 RAM
When the Xen hypervisor boots a PV kernel it hands it two pieces of information: nr_pages and a made up E820 entry. The nr_pages value defines the range from zero to nr_pages of PFNs which have a valid Machine Frame Number (MFN) underneath it. The E820 mirrors that (with the VGA hole): BIOS-provided physical RAM map: Xen: 0000000000000000 - 00000000000a0000 (usable) Xen: 00000000000a0000 - 0000000000100000 (reserved) Xen: 0000000000100000 - 0000000080800000 (usable) The fun comes when a PV guest that is run with a machine E820 - that can either be the initial domain or a PCI PV guest, where the E820 looks like the normal thing: BIOS-provided physical RAM map: Xen: 0000000000000000 - 000000000009e000 (usable) Xen: 000000000009ec00 - 0000000000100000 (reserved) Xen: 0000000000100000 - 0000000020000000 (usable) Xen: 0000000020000000 - 0000000020200000 (reserved) Xen: 0000000020200000 - 0000000040000000 (usable) Xen: 0000000040000000 - 0000000040200000 (reserved) Xen: 0000000040200000 - 00000000bad80000 (usable) Xen: 00000000bad80000 - 00000000badc9000 (ACPI NVS) .. With that overlaying the nr_pages directly on the E820 does not work as there are gaps and non-RAM regions that won't be used by the memory allocator. The 'xen_release_chunk' helps with that by punching holes in the P2M (PFN to MFN lookup tree) for those regions and tells us that: Freeing 20000-20200 pfn range: 512 pages freed Freeing 40000-40200 pfn range: 512 pages freed Freeing bad80-badf4 pfn range: 116 pages freed Freeing badf6-bae7f pfn range: 137 pages freed Freeing bb000-100000 pfn range: 282624 pages freed Released 283999 pages of unused memory Those 283999 pages are subtracted from the nr_pages and are returned to the hypervisor. The end result is that the initial domain boots with 1GB less memory as the nr_pages has been subtracted by the amount of pages residing within the PCI hole. 
It can balloon up to that if desired using 'xl mem-set 0 8092', but the balloon driver is not always compiled in for the initial domain. This patch, implements the populate hypercall (XENMEM_populate_physmap) which increases the domain with the same amount of pages that were released. The other solution (that did not work) was to transplant the MFN in the P2M tree - the ones that were going to be freed were put in the E820_RAM regions past the nr_pages. But the modifications to the M2P array (the other side of creating PTEs) were not carried over. As the hypervisor is the only one capable of modifying that and the only two hypercalls that would do this are: the update_va_mapping (which won't work, as during initial bootup only PFNs up to nr_pages are mapped in the guest) or via the populate hypercall. The end result is that the kernel can now boot with the nr_pages without having to subtract the 283999 pages. On a 8GB machine, with various dom0_mem= parameters this is what we get: no dom0_mem -Memory: 6485264k/9435136k available (5817k kernel code, 1136060k absent, 1813812k reserved, 2899k data, 696k init) +Memory: 7619036k/9435136k available (5817k kernel code, 1136060k absent, 680040k reserved, 2899k data, 696k init) dom0_mem=3G -Memory: 2616536k/9435136k available (5817k kernel code, 1136060k absent, 5682540k reserved, 2899k data, 696k init) +Memory: 2703776k/9435136k available (5817k kernel code, 1136060k absent, 5595300k reserved, 2899k data, 696k init) dom0_mem=max:3G -Memory: 2696732k/4281724k available (5817k kernel code, 1136060k absent, 448932k reserved, 2899k data, 696k init) +Memory: 2702204k/4281724k available (5817k kernel code, 1136060k absent, 443460k reserved, 2899k data, 696k init) And the 'xm list' or 'xl list' now reflect what the dom0_mem= argument is. Acked-by: David Vrabel <david.vrabel@citrix.com> [v2: Use populate hypercall] [v3: Remove debug printks] [v4: Simplify code] Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/xen/setup.c116
1 files changed, 112 insertions, 4 deletions
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 7b0ab77b8479..710af36e6dfb 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -26,7 +26,6 @@
26#include <xen/interface/memory.h> 26#include <xen/interface/memory.h>
27#include <xen/interface/physdev.h> 27#include <xen/interface/physdev.h>
28#include <xen/features.h> 28#include <xen/features.h>
29
30#include "xen-ops.h" 29#include "xen-ops.h"
31#include "vdso.h" 30#include "vdso.h"
32 31
@@ -120,7 +119,105 @@ static unsigned long __init xen_release_chunk(unsigned long start,
120 119
121 return len; 120 return len;
122} 121}
122static unsigned long __init xen_populate_physmap(unsigned long start,
123 unsigned long end)
124{
125 struct xen_memory_reservation reservation = {
126 .address_bits = 0,
127 .extent_order = 0,
128 .domid = DOMID_SELF
129 };
130 unsigned long len = 0;
131 int ret;
132
133 for (pfn = start; pfn < end; pfn++) {
134 unsigned long frame;
135
136 /* Make sure pfn does not exists to start with */
137 if (pfn_to_mfn(pfn) != INVALID_P2M_ENTRY)
138 continue;
123 139
140 frame = pfn;
141 set_xen_guest_handle(reservation.extent_start, &frame);
142 reservation.nr_extents = 1;
143
144 ret = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
145 WARN(ret != 1, "Failed to populate pfn %lx err=%d\n", pfn, ret);
146 if (ret == 1) {
147 if (!early_set_phys_to_machine(pfn, frame)) {
148 set_xen_guest_handle(reservation.extent_start, &frame);
149 reservation.nr_extents = 1;
150 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
151 &reservation);
152 break;
153 }
154 len++;
155 } else
156 break;
157 }
158 if (len)
159 printk(KERN_INFO "Populated %lx-%lx pfn range: %lu pages added\n",
160 start, end, len);
161 return len;
162}
163static unsigned long __init xen_populate_chunk(
164 const struct e820entry *list, size_t map_size,
165 unsigned long max_pfn, unsigned long *last_pfn,
166 unsigned long credits_left)
167{
168 const struct e820entry *entry;
169 unsigned int i;
170 unsigned long done = 0;
171 unsigned long dest_pfn;
172
173 for (i = 0, entry = list; i < map_size; i++, entry++) {
174 unsigned long credits = credits_left;
175 unsigned long s_pfn;
176 unsigned long e_pfn;
177 unsigned long pfns;
178 long capacity;
179
180 if (credits <= 0)
181 break;
182
183 if (entry->type != E820_RAM)
184 continue;
185
186 e_pfn = PFN_UP(entry->addr + entry->size);
187
188 /* We only care about E820 after the xen_start_info->nr_pages */
189 if (e_pfn <= max_pfn)
190 continue;
191
192 s_pfn = PFN_DOWN(entry->addr);
193 /* If the E820 falls within the nr_pages, we want to start
194 * at the nr_pages PFN.
195 * If that would mean going past the E820 entry, skip it
196 */
197 if (s_pfn <= max_pfn) {
198 capacity = e_pfn - max_pfn;
199 dest_pfn = max_pfn;
200 } else {
201 /* last_pfn MUST be within E820_RAM regions */
202 if (*last_pfn && e_pfn >= *last_pfn)
203 s_pfn = *last_pfn;
204 capacity = e_pfn - s_pfn;
205 dest_pfn = s_pfn;
206 }
207 /* If we had filled this E820_RAM entry, go to the next one. */
208 if (capacity <= 0)
209 continue;
210
211 if (credits > capacity)
212 credits = capacity;
213
214 pfns = xen_populate_physmap(dest_pfn, dest_pfn + credits);
215 done += pfns;
216 credits_left -= pfns;
217 *last_pfn = (dest_pfn + pfns);
218 }
219 return done;
220}
124static unsigned long __init xen_set_identity_and_release( 221static unsigned long __init xen_set_identity_and_release(
125 const struct e820entry *list, size_t map_size, unsigned long nr_pages) 222 const struct e820entry *list, size_t map_size, unsigned long nr_pages)
126{ 223{
@@ -143,7 +240,6 @@ static unsigned long __init xen_set_identity_and_release(
143 */ 240 */
144 for (i = 0, entry = list; i < map_size; i++, entry++) { 241 for (i = 0, entry = list; i < map_size; i++, entry++) {
145 phys_addr_t end = entry->addr + entry->size; 242 phys_addr_t end = entry->addr + entry->size;
146
147 if (entry->type == E820_RAM || i == map_size - 1) { 243 if (entry->type == E820_RAM || i == map_size - 1) {
148 unsigned long start_pfn = PFN_DOWN(start); 244 unsigned long start_pfn = PFN_DOWN(start);
149 unsigned long end_pfn = PFN_UP(end); 245 unsigned long end_pfn = PFN_UP(end);
@@ -220,7 +316,9 @@ char * __init xen_memory_setup(void)
220 int rc; 316 int rc;
221 struct xen_memory_map memmap; 317 struct xen_memory_map memmap;
222 unsigned long max_pages; 318 unsigned long max_pages;
319 unsigned long last_pfn = 0;
223 unsigned long extra_pages = 0; 320 unsigned long extra_pages = 0;
321 unsigned long populated;
224 int i; 322 int i;
225 int op; 323 int op;
226 324
@@ -260,9 +358,20 @@ char * __init xen_memory_setup(void)
260 */ 358 */
261 xen_released_pages = xen_set_identity_and_release( 359 xen_released_pages = xen_set_identity_and_release(
262 map, memmap.nr_entries, max_pfn); 360 map, memmap.nr_entries, max_pfn);
263 extra_pages += xen_released_pages;
264 361
265 /* 362 /*
363 * Populate back the non-RAM pages and E820 gaps that had been
364 * released. */
365 populated = xen_populate_chunk(map, memmap.nr_entries,
366 max_pfn, &last_pfn, xen_released_pages);
367
368 extra_pages += (xen_released_pages - populated);
369
370 if (last_pfn > max_pfn) {
371 max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
372 mem_end = PFN_PHYS(max_pfn);
373 }
374 /*
266 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO 375 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
267 * factor the base size. On non-highmem systems, the base 376 * factor the base size. On non-highmem systems, the base
268 * size is the full initial memory allocation; on highmem it 377 * size is the full initial memory allocation; on highmem it
@@ -275,7 +384,6 @@ char * __init xen_memory_setup(void)
275 */ 384 */
276 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), 385 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
277 extra_pages); 386 extra_pages);
278
279 i = 0; 387 i = 0;
280 while (i < memmap.nr_entries) { 388 while (i < memmap.nr_entries) {
281 u64 addr = map[i].addr; 389 u64 addr = map[i].addr;