Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/init.c            459
-rw-r--r--  arch/x86/mm/init_32.c         106
-rw-r--r--  arch/x86/mm/init_64.c         255
-rw-r--r--  arch/x86/mm/mm_internal.h      19
-rw-r--r--  arch/x86/mm/numa.c             32
-rw-r--r--  arch/x86/mm/numa_32.c         161
-rw-r--r--  arch/x86/mm/numa_64.c          13
-rw-r--r--  arch/x86/mm/numa_internal.h     6
-rw-r--r--  arch/x86/mm/pageattr.c         66
-rw-r--r--  arch/x86/mm/pat.c               4
-rw-r--r--  arch/x86/mm/pgtable.c           7
-rw-r--r--  arch/x86/mm/physaddr.c         60
12 files changed, 634 insertions(+), 554 deletions(-)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d7aea41563b3..d41815265a0b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -17,86 +17,132 @@
17#include <asm/proto.h> 17#include <asm/proto.h>
18#include <asm/dma.h> /* for MAX_DMA_PFN */ 18#include <asm/dma.h> /* for MAX_DMA_PFN */
19 19
20unsigned long __initdata pgt_buf_start; 20#include "mm_internal.h"
21unsigned long __meminitdata pgt_buf_end;
22unsigned long __meminitdata pgt_buf_top;
23 21
24int after_bootmem; 22static unsigned long __initdata pgt_buf_start;
23static unsigned long __initdata pgt_buf_end;
24static unsigned long __initdata pgt_buf_top;
25 25
26int direct_gbpages 26static unsigned long min_pfn_mapped;
27#ifdef CONFIG_DIRECT_GBPAGES
28 = 1
29#endif
30;
31 27
32struct map_range { 28static bool __initdata can_use_brk_pgt = true;
33 unsigned long start;
34 unsigned long end;
35 unsigned page_size_mask;
36};
37 29
38/* 30/*
39 * First calculate space needed for kernel direct mapping page tables to cover 31 * Pages returned are already directly mapped.
40 * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB 32 *
41 * pages. Then find enough contiguous space for those page tables. 33 * Changing that is likely to break Xen, see commit:
34 *
35 * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
36 *
37 * for detailed information.
42 */ 38 */
43static void __init find_early_table_space(struct map_range *mr, int nr_range) 39__ref void *alloc_low_pages(unsigned int num)
44{ 40{
41 unsigned long pfn;
45 int i; 42 int i;
46 unsigned long puds = 0, pmds = 0, ptes = 0, tables;
47 unsigned long start = 0, good_end;
48 phys_addr_t base;
49 43
50 for (i = 0; i < nr_range; i++) { 44 if (after_bootmem) {
51 unsigned long range, extra; 45 unsigned int order;
52 46
53 range = mr[i].end - mr[i].start; 47 order = get_order((unsigned long)num << PAGE_SHIFT);
54 puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; 48 return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
49 __GFP_ZERO, order);
50 }
55 51
56 if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { 52 if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
57 extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); 53 unsigned long ret;
58 pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; 54 if (min_pfn_mapped >= max_pfn_mapped)
59 } else { 55 panic("alloc_low_page: ran out of memory");
60 pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; 56 ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
61 } 57 max_pfn_mapped << PAGE_SHIFT,
58 PAGE_SIZE * num , PAGE_SIZE);
59 if (!ret)
60 panic("alloc_low_page: can not alloc memory");
61 memblock_reserve(ret, PAGE_SIZE * num);
62 pfn = ret >> PAGE_SHIFT;
63 } else {
64 pfn = pgt_buf_end;
65 pgt_buf_end += num;
66 printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
67 pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
68 }
62 69
63 if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { 70 for (i = 0; i < num; i++) {
64 extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); 71 void *adr;
65#ifdef CONFIG_X86_32 72
66 extra += PMD_SIZE; 73 adr = __va((pfn + i) << PAGE_SHIFT);
67#endif 74 clear_page(adr);
68 ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
69 } else {
70 ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
71 }
72 } 75 }
73 76
74 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); 77 return __va(pfn << PAGE_SHIFT);
75 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); 78}
76 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
77 79
78#ifdef CONFIG_X86_32 80/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
79 /* for fixmap */ 81#define INIT_PGT_BUF_SIZE (5 * PAGE_SIZE)
80 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 82RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
81#endif 83void __init early_alloc_pgt_buf(void)
82 good_end = max_pfn_mapped << PAGE_SHIFT; 84{
85 unsigned long tables = INIT_PGT_BUF_SIZE;
86 phys_addr_t base;
83 87
84 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); 88 base = __pa(extend_brk(tables, PAGE_SIZE));
85 if (!base)
86 panic("Cannot find space for the kernel page tables");
87 89
88 pgt_buf_start = base >> PAGE_SHIFT; 90 pgt_buf_start = base >> PAGE_SHIFT;
89 pgt_buf_end = pgt_buf_start; 91 pgt_buf_end = pgt_buf_start;
90 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); 92 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
93}
94
95int after_bootmem;
96
97int direct_gbpages
98#ifdef CONFIG_DIRECT_GBPAGES
99 = 1
100#endif
101;
91 102
92 printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", 103static void __init init_gbpages(void)
93 mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, 104{
94 (pgt_buf_top << PAGE_SHIFT) - 1); 105#ifdef CONFIG_X86_64
106 if (direct_gbpages && cpu_has_gbpages)
107 printk(KERN_INFO "Using GB pages for direct mapping\n");
108 else
109 direct_gbpages = 0;
110#endif
95} 111}
96 112
97void __init native_pagetable_reserve(u64 start, u64 end) 113struct map_range {
114 unsigned long start;
115 unsigned long end;
116 unsigned page_size_mask;
117};
118
119static int page_size_mask;
120
121static void __init probe_page_size_mask(void)
98{ 122{
99 memblock_reserve(start, end - start); 123 init_gbpages();
124
125#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
126 /*
127 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
128 * This will simplify cpa(), which otherwise needs to support splitting
129 * large pages into small in interrupt context, etc.
130 */
131 if (direct_gbpages)
132 page_size_mask |= 1 << PG_LEVEL_1G;
133 if (cpu_has_pse)
134 page_size_mask |= 1 << PG_LEVEL_2M;
135#endif
136
137 /* Enable PSE if available */
138 if (cpu_has_pse)
139 set_in_cr4(X86_CR4_PSE);
140
141 /* Enable PGE if available */
142 if (cpu_has_pge) {
143 set_in_cr4(X86_CR4_PGE);
144 __supported_pte_mask |= _PAGE_GLOBAL;
145 }
100} 146}
101 147
102#ifdef CONFIG_X86_32 148#ifdef CONFIG_X86_32
@@ -122,58 +168,51 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
122} 168}
123 169
124/* 170/*
125 * Setup the direct mapping of the physical memory at PAGE_OFFSET. 171 * adjust the page_size_mask for small range to go with
126 * This runs before bootmem is initialized and gets pages directly from 172 * big page size instead small one if nearby are ram too.
127 * the physical memory. To access them they are temporarily mapped.
128 */ 173 */
129unsigned long __init_refok init_memory_mapping(unsigned long start, 174static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
130 unsigned long end) 175 int nr_range)
131{ 176{
132 unsigned long page_size_mask = 0; 177 int i;
133 unsigned long start_pfn, end_pfn;
134 unsigned long ret = 0;
135 unsigned long pos;
136
137 struct map_range mr[NR_RANGE_MR];
138 int nr_range, i;
139 int use_pse, use_gbpages;
140 178
141 printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", 179 for (i = 0; i < nr_range; i++) {
142 start, end - 1); 180 if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
181 !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
182 unsigned long start = round_down(mr[i].start, PMD_SIZE);
183 unsigned long end = round_up(mr[i].end, PMD_SIZE);
143 184
144#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) 185#ifdef CONFIG_X86_32
145 /* 186 if ((end >> PAGE_SHIFT) > max_low_pfn)
146 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. 187 continue;
147 * This will simplify cpa(), which otherwise needs to support splitting
148 * large pages into small in interrupt context, etc.
149 */
150 use_pse = use_gbpages = 0;
151#else
152 use_pse = cpu_has_pse;
153 use_gbpages = direct_gbpages;
154#endif 188#endif
155 189
156 /* Enable PSE if available */ 190 if (memblock_is_region_memory(start, end - start))
157 if (cpu_has_pse) 191 mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
158 set_in_cr4(X86_CR4_PSE); 192 }
193 if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
194 !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
195 unsigned long start = round_down(mr[i].start, PUD_SIZE);
196 unsigned long end = round_up(mr[i].end, PUD_SIZE);
159 197
160 /* Enable PGE if available */ 198 if (memblock_is_region_memory(start, end - start))
161 if (cpu_has_pge) { 199 mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
162 set_in_cr4(X86_CR4_PGE); 200 }
163 __supported_pte_mask |= _PAGE_GLOBAL;
164 } 201 }
202}
165 203
166 if (use_gbpages) 204static int __meminit split_mem_range(struct map_range *mr, int nr_range,
167 page_size_mask |= 1 << PG_LEVEL_1G; 205 unsigned long start,
168 if (use_pse) 206 unsigned long end)
169 page_size_mask |= 1 << PG_LEVEL_2M; 207{
208 unsigned long start_pfn, end_pfn, limit_pfn;
209 unsigned long pfn;
210 int i;
170 211
171 memset(mr, 0, sizeof(mr)); 212 limit_pfn = PFN_DOWN(end);
172 nr_range = 0;
173 213
174 /* head if not big page alignment ? */ 214 /* head if not big page alignment ? */
175 start_pfn = start >> PAGE_SHIFT; 215 pfn = start_pfn = PFN_DOWN(start);
176 pos = start_pfn << PAGE_SHIFT;
177#ifdef CONFIG_X86_32 216#ifdef CONFIG_X86_32
178 /* 217 /*
179 * Don't use a large page for the first 2/4MB of memory 218 * Don't use a large page for the first 2/4MB of memory
@@ -181,66 +220,60 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
181 * and overlapping MTRRs into large pages can cause 220 * and overlapping MTRRs into large pages can cause
182 * slowdowns. 221 * slowdowns.
183 */ 222 */
184 if (pos == 0) 223 if (pfn == 0)
185 end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); 224 end_pfn = PFN_DOWN(PMD_SIZE);
186 else 225 else
187 end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) 226 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
188 << (PMD_SHIFT - PAGE_SHIFT);
189#else /* CONFIG_X86_64 */ 227#else /* CONFIG_X86_64 */
190 end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) 228 end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
191 << (PMD_SHIFT - PAGE_SHIFT);
192#endif 229#endif
193 if (end_pfn > (end >> PAGE_SHIFT)) 230 if (end_pfn > limit_pfn)
194 end_pfn = end >> PAGE_SHIFT; 231 end_pfn = limit_pfn;
195 if (start_pfn < end_pfn) { 232 if (start_pfn < end_pfn) {
196 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); 233 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
197 pos = end_pfn << PAGE_SHIFT; 234 pfn = end_pfn;
198 } 235 }
199 236
200 /* big page (2M) range */ 237 /* big page (2M) range */
201 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) 238 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
202 << (PMD_SHIFT - PAGE_SHIFT);
203#ifdef CONFIG_X86_32 239#ifdef CONFIG_X86_32
204 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); 240 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
205#else /* CONFIG_X86_64 */ 241#else /* CONFIG_X86_64 */
206 end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) 242 end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
207 << (PUD_SHIFT - PAGE_SHIFT); 243 if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
208 if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) 244 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
209 end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
210#endif 245#endif
211 246
212 if (start_pfn < end_pfn) { 247 if (start_pfn < end_pfn) {
213 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 248 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
214 page_size_mask & (1<<PG_LEVEL_2M)); 249 page_size_mask & (1<<PG_LEVEL_2M));
215 pos = end_pfn << PAGE_SHIFT; 250 pfn = end_pfn;
216 } 251 }
217 252
218#ifdef CONFIG_X86_64 253#ifdef CONFIG_X86_64
219 /* big page (1G) range */ 254 /* big page (1G) range */
220 start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) 255 start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
221 << (PUD_SHIFT - PAGE_SHIFT); 256 end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
222 end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
223 if (start_pfn < end_pfn) { 257 if (start_pfn < end_pfn) {
224 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 258 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
225 page_size_mask & 259 page_size_mask &
226 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); 260 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
227 pos = end_pfn << PAGE_SHIFT; 261 pfn = end_pfn;
228 } 262 }
229 263
230 /* tail is not big page (1G) alignment */ 264 /* tail is not big page (1G) alignment */
231 start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) 265 start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
232 << (PMD_SHIFT - PAGE_SHIFT); 266 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
233 end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
234 if (start_pfn < end_pfn) { 267 if (start_pfn < end_pfn) {
235 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 268 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
236 page_size_mask & (1<<PG_LEVEL_2M)); 269 page_size_mask & (1<<PG_LEVEL_2M));
237 pos = end_pfn << PAGE_SHIFT; 270 pfn = end_pfn;
238 } 271 }
239#endif 272#endif
240 273
241 /* tail is not big page (2M) alignment */ 274 /* tail is not big page (2M) alignment */
242 start_pfn = pos>>PAGE_SHIFT; 275 start_pfn = pfn;
243 end_pfn = end>>PAGE_SHIFT; 276 end_pfn = limit_pfn;
244 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); 277 nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
245 278
246 /* try to merge same page size and continuous */ 279 /* try to merge same page size and continuous */
@@ -257,59 +290,169 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
257 nr_range--; 290 nr_range--;
258 } 291 }
259 292
293 if (!after_bootmem)
294 adjust_range_page_size_mask(mr, nr_range);
295
260 for (i = 0; i < nr_range; i++) 296 for (i = 0; i < nr_range; i++)
261 printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", 297 printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
262 mr[i].start, mr[i].end - 1, 298 mr[i].start, mr[i].end - 1,
263 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( 299 (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
264 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 300 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
265 301
266 /* 302 return nr_range;
267 * Find space for the kernel direct mapping tables. 303}
268 * 304
269 * Later we should allocate these tables in the local node of the 305struct range pfn_mapped[E820_X_MAX];
270 * memory mapped. Unfortunately this is done currently before the 306int nr_pfn_mapped;
271 * nodes are discovered. 307
272 */ 308static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
273 if (!after_bootmem) 309{
274 find_early_table_space(mr, nr_range); 310 nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
311 nr_pfn_mapped, start_pfn, end_pfn);
312 nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
313
314 max_pfn_mapped = max(max_pfn_mapped, end_pfn);
315
316 if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
317 max_low_pfn_mapped = max(max_low_pfn_mapped,
318 min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
319}
320
321bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
322{
323 int i;
324
325 for (i = 0; i < nr_pfn_mapped; i++)
326 if ((start_pfn >= pfn_mapped[i].start) &&
327 (end_pfn <= pfn_mapped[i].end))
328 return true;
329
330 return false;
331}
332
333/*
334 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
335 * This runs before bootmem is initialized and gets pages directly from
336 * the physical memory. To access them they are temporarily mapped.
337 */
338unsigned long __init_refok init_memory_mapping(unsigned long start,
339 unsigned long end)
340{
341 struct map_range mr[NR_RANGE_MR];
342 unsigned long ret = 0;
343 int nr_range, i;
344
345 pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
346 start, end - 1);
347
348 memset(mr, 0, sizeof(mr));
349 nr_range = split_mem_range(mr, 0, start, end);
275 350
276 for (i = 0; i < nr_range; i++) 351 for (i = 0; i < nr_range; i++)
277 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, 352 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
278 mr[i].page_size_mask); 353 mr[i].page_size_mask);
279 354
280#ifdef CONFIG_X86_32 355 add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
281 early_ioremap_page_table_range_init();
282 356
283 load_cr3(swapper_pg_dir); 357 return ret >> PAGE_SHIFT;
284#endif 358}
285 359
286 __flush_tlb_all(); 360/*
361 * would have hole in the middle or ends, and only ram parts will be mapped.
362 */
363static unsigned long __init init_range_memory_mapping(
364 unsigned long r_start,
365 unsigned long r_end)
366{
367 unsigned long start_pfn, end_pfn;
368 unsigned long mapped_ram_size = 0;
369 int i;
287 370
288 /* 371 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
289 * Reserve the kernel pagetable pages we used (pgt_buf_start - 372 u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
290 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) 373 u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
291 * so that they can be reused for other purposes. 374 if (start >= end)
292 * 375 continue;
293 * On native it just means calling memblock_reserve, on Xen it also
294 * means marking RW the pagetable pages that we allocated before
295 * but that haven't been used.
296 *
297 * In fact on xen we mark RO the whole range pgt_buf_start -
298 * pgt_buf_top, because we have to make sure that when
299 * init_memory_mapping reaches the pagetable pages area, it maps
300 * RO all the pagetable pages, including the ones that are beyond
301 * pgt_buf_end at that time.
302 */
303 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
304 x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
305 PFN_PHYS(pgt_buf_end));
306 376
307 if (!after_bootmem) 377 /*
308 early_memtest(start, end); 378 * if it is overlapping with brk pgt, we need to
379 * alloc pgt buf from memblock instead.
380 */
381 can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
382 min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
383 init_memory_mapping(start, end);
384 mapped_ram_size += end - start;
385 can_use_brk_pgt = true;
386 }
309 387
310 return ret >> PAGE_SHIFT; 388 return mapped_ram_size;
311} 389}
312 390
391/* (PUD_SHIFT-PMD_SHIFT)/2 */
392#define STEP_SIZE_SHIFT 5
393void __init init_mem_mapping(void)
394{
395 unsigned long end, real_end, start, last_start;
396 unsigned long step_size;
397 unsigned long addr;
398 unsigned long mapped_ram_size = 0;
399 unsigned long new_mapped_ram_size;
400
401 probe_page_size_mask();
402
403#ifdef CONFIG_X86_64
404 end = max_pfn << PAGE_SHIFT;
405#else
406 end = max_low_pfn << PAGE_SHIFT;
407#endif
408
409 /* the ISA range is always mapped regardless of memory holes */
410 init_memory_mapping(0, ISA_END_ADDRESS);
411
412 /* xen has big range in reserved near end of ram, skip it at first */
413 addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
414 PAGE_SIZE);
415 real_end = addr + PMD_SIZE;
416
417 /* step_size need to be small so pgt_buf from BRK could cover it */
418 step_size = PMD_SIZE;
419 max_pfn_mapped = 0; /* will get exact value next */
420 min_pfn_mapped = real_end >> PAGE_SHIFT;
421 last_start = start = real_end;
422 while (last_start > ISA_END_ADDRESS) {
423 if (last_start > step_size) {
424 start = round_down(last_start - 1, step_size);
425 if (start < ISA_END_ADDRESS)
426 start = ISA_END_ADDRESS;
427 } else
428 start = ISA_END_ADDRESS;
429 new_mapped_ram_size = init_range_memory_mapping(start,
430 last_start);
431 last_start = start;
432 min_pfn_mapped = last_start >> PAGE_SHIFT;
433 /* only increase step_size after big range get mapped */
434 if (new_mapped_ram_size > mapped_ram_size)
435 step_size <<= STEP_SIZE_SHIFT;
436 mapped_ram_size += new_mapped_ram_size;
437 }
438
439 if (real_end < end)
440 init_range_memory_mapping(real_end, end);
441
442#ifdef CONFIG_X86_64
443 if (max_pfn > max_low_pfn) {
444 /* can we preseve max_low_pfn ?*/
445 max_low_pfn = max_pfn;
446 }
447#else
448 early_ioremap_page_table_range_init();
449#endif
450
451 load_cr3(swapper_pg_dir);
452 __flush_tlb_all();
453
454 early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
455}
313 456
314/* 457/*
315 * devmem_is_allowed() checks to see if /dev/mem access to a certain address 458 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
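
The top-down mapping strategy introduced by init_mem_mapping() above is easiest to see with a little arithmetic: each pass maps a window just below the previous one and then enlarges the window, because the RAM mapped in one pass can hold the page tables needed for the next, larger pass (the step only grows once a pass has actually mapped more RAM than the running total). The window sizes can be illustrated with a small standalone program; this is an informal sketch, not part of the patch, and it only assumes PMD_SIZE = 2 MB and STEP_SIZE_SHIFT = 5 as defined in the hunk above.

#include <stdio.h>

int main(void)
{
	/* Model of the step growth in init_mem_mapping(); illustration only. */
	unsigned long long step = 2ULL << 20;	/* PMD_SIZE: first window fits the 5-page BRK buffer */
	int shift = 5;				/* STEP_SIZE_SHIFT */

	for (int pass = 1; pass <= 4; pass++) {
		printf("pass %d: window of up to %llu MB mapped below the previous one\n",
		       pass, step >> 20);
		step <<= shift;	/* 32x larger once this pass has mapped new RAM */
	}
	return 0;
}
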
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 745d66b843c8..b299724f6e34 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -53,25 +53,14 @@
53#include <asm/page_types.h> 53#include <asm/page_types.h>
54#include <asm/init.h> 54#include <asm/init.h>
55 55
56#include "mm_internal.h"
57
56unsigned long highstart_pfn, highend_pfn; 58unsigned long highstart_pfn, highend_pfn;
57 59
58static noinline int do_test_wp_bit(void); 60static noinline int do_test_wp_bit(void);
59 61
60bool __read_mostly __vmalloc_start_set = false; 62bool __read_mostly __vmalloc_start_set = false;
61 63
62static __init void *alloc_low_page(void)
63{
64 unsigned long pfn = pgt_buf_end++;
65 void *adr;
66
67 if (pfn >= pgt_buf_top)
68 panic("alloc_low_page: ran out of memory");
69
70 adr = __va(pfn * PAGE_SIZE);
71 clear_page(adr);
72 return adr;
73}
74
75/* 64/*
76 * Creates a middle page table and puts a pointer to it in the 65 * Creates a middle page table and puts a pointer to it in the
77 * given global directory entry. This only returns the gd entry 66 * given global directory entry. This only returns the gd entry
@@ -84,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
84 73
85#ifdef CONFIG_X86_PAE 74#ifdef CONFIG_X86_PAE
86 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { 75 if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
87 if (after_bootmem) 76 pmd_table = (pmd_t *)alloc_low_page();
88 pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
89 else
90 pmd_table = (pmd_t *)alloc_low_page();
91 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); 77 paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
92 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 78 set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
93 pud = pud_offset(pgd, 0); 79 pud = pud_offset(pgd, 0);
@@ -109,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
109static pte_t * __init one_page_table_init(pmd_t *pmd) 95static pte_t * __init one_page_table_init(pmd_t *pmd)
110{ 96{
111 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { 97 if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
112 pte_t *page_table = NULL; 98 pte_t *page_table = (pte_t *)alloc_low_page();
113
114 if (after_bootmem) {
115#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
116 page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
117#endif
118 if (!page_table)
119 page_table =
120 (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
121 } else
122 page_table = (pte_t *)alloc_low_page();
123 99
124 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); 100 paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
125 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); 101 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -146,8 +122,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
146 return one_page_table_init(pmd) + pte_idx; 122 return one_page_table_init(pmd) + pte_idx;
147} 123}
148 124
125static unsigned long __init
126page_table_range_init_count(unsigned long start, unsigned long end)
127{
128 unsigned long count = 0;
129#ifdef CONFIG_HIGHMEM
130 int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
131 int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
132 int pgd_idx, pmd_idx;
133 unsigned long vaddr;
134
135 if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
136 return 0;
137
138 vaddr = start;
139 pgd_idx = pgd_index(vaddr);
140
141 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
142 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
143 pmd_idx++) {
144 if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
145 (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
146 count++;
147 vaddr += PMD_SIZE;
148 }
149 pmd_idx = 0;
150 }
151#endif
152 return count;
153}
154
149static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, 155static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
150 unsigned long vaddr, pte_t *lastpte) 156 unsigned long vaddr, pte_t *lastpte,
157 void **adr)
151{ 158{
152#ifdef CONFIG_HIGHMEM 159#ifdef CONFIG_HIGHMEM
153 /* 160 /*
@@ -161,16 +168,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
161 168
162 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 169 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
163 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 170 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
164 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 171 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
165 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
166 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
167 pte_t *newpte; 172 pte_t *newpte;
168 int i; 173 int i;
169 174
170 BUG_ON(after_bootmem); 175 BUG_ON(after_bootmem);
171 newpte = alloc_low_page(); 176 newpte = *adr;
172 for (i = 0; i < PTRS_PER_PTE; i++) 177 for (i = 0; i < PTRS_PER_PTE; i++)
173 set_pte(newpte + i, pte[i]); 178 set_pte(newpte + i, pte[i]);
179 *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
174 180
175 paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); 181 paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
176 set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); 182 set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@ -204,6 +210,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
204 pgd_t *pgd; 210 pgd_t *pgd;
205 pmd_t *pmd; 211 pmd_t *pmd;
206 pte_t *pte = NULL; 212 pte_t *pte = NULL;
213 unsigned long count = page_table_range_init_count(start, end);
214 void *adr = NULL;
215
216 if (count)
217 adr = alloc_low_pages(count);
207 218
208 vaddr = start; 219 vaddr = start;
209 pgd_idx = pgd_index(vaddr); 220 pgd_idx = pgd_index(vaddr);
@@ -216,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
216 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); 227 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
217 pmd++, pmd_idx++) { 228 pmd++, pmd_idx++) {
218 pte = page_table_kmap_check(one_page_table_init(pmd), 229 pte = page_table_kmap_check(one_page_table_init(pmd),
219 pmd, vaddr, pte); 230 pmd, vaddr, pte, &adr);
220 231
221 vaddr += PMD_SIZE; 232 vaddr += PMD_SIZE;
222 } 233 }
@@ -310,6 +321,7 @@ repeat:
310 __pgprot(PTE_IDENT_ATTR | 321 __pgprot(PTE_IDENT_ATTR |
311 _PAGE_PSE); 322 _PAGE_PSE);
312 323
324 pfn &= PMD_MASK >> PAGE_SHIFT;
313 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + 325 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
314 PAGE_OFFSET + PAGE_SIZE-1; 326 PAGE_OFFSET + PAGE_SIZE-1;
315 327
@@ -455,9 +467,14 @@ void __init native_pagetable_init(void)
455 467
456 /* 468 /*
457 * Remove any mappings which extend past the end of physical 469 * Remove any mappings which extend past the end of physical
458 * memory from the boot time page table: 470 * memory from the boot time page table.
471 * In virtual address space, we should have at least two pages
472 * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
473 * definition. And max_low_pfn is set to VMALLOC_END physical
474 * address. If initial memory mapping is doing right job, we
475 * should have pte used near max_low_pfn or one pmd is not present.
459 */ 476 */
460 for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { 477 for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
461 va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); 478 va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
462 pgd = base + pgd_index(va); 479 pgd = base + pgd_index(va);
463 if (!pgd_present(*pgd)) 480 if (!pgd_present(*pgd))
@@ -468,10 +485,19 @@ void __init native_pagetable_init(void)
468 if (!pmd_present(*pmd)) 485 if (!pmd_present(*pmd))
469 break; 486 break;
470 487
488 /* should not be large page here */
489 if (pmd_large(*pmd)) {
490 pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
491 pfn, pmd, __pa(pmd));
492 BUG_ON(1);
493 }
494
471 pte = pte_offset_kernel(pmd, va); 495 pte = pte_offset_kernel(pmd, va);
472 if (!pte_present(*pte)) 496 if (!pte_present(*pte))
473 break; 497 break;
474 498
499 printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
500 pfn, pmd, __pa(pmd), pte, __pa(pte));
475 pte_clear(NULL, va, pte); 501 pte_clear(NULL, va, pte);
476 } 502 }
477 paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); 503 paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
@@ -550,7 +576,7 @@ early_param("highmem", parse_highmem);
550 * artificially via the highmem=x boot parameter then create 576 * artificially via the highmem=x boot parameter then create
551 * it: 577 * it:
552 */ 578 */
553void __init lowmem_pfn_init(void) 579static void __init lowmem_pfn_init(void)
554{ 580{
555 /* max_low_pfn is 0, we already have early_res support */ 581 /* max_low_pfn is 0, we already have early_res support */
556 max_low_pfn = max_pfn; 582 max_low_pfn = max_pfn;
@@ -586,7 +612,7 @@ void __init lowmem_pfn_init(void)
586 * We have more RAM than fits into lowmem - we try to put it into 612 * We have more RAM than fits into lowmem - we try to put it into
587 * highmem, also taking the highmem=x boot parameter into account: 613 * highmem, also taking the highmem=x boot parameter into account:
588 */ 614 */
589void __init highmem_pfn_init(void) 615static void __init highmem_pfn_init(void)
590{ 616{
591 max_low_pfn = MAXMEM_PFN; 617 max_low_pfn = MAXMEM_PFN;
592 618
@@ -669,8 +695,6 @@ void __init setup_bootmem_allocator(void)
669 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 695 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
670 max_pfn_mapped<<PAGE_SHIFT); 696 max_pfn_mapped<<PAGE_SHIFT);
671 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 697 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
672
673 after_bootmem = 1;
674} 698}
675 699
676/* 700/*
@@ -753,6 +777,8 @@ void __init mem_init(void)
753 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) 777 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
754 reservedpages++; 778 reservedpages++;
755 779
780 after_bootmem = 1;
781
756 codesize = (unsigned long) &_etext - (unsigned long) &_text; 782 codesize = (unsigned long) &_etext - (unsigned long) &_text;
757 datasize = (unsigned long) &_edata - (unsigned long) &_etext; 783 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
758 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; 784 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d6eeead43758..3eba7f429880 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,82 @@
54#include <asm/uv/uv.h> 54#include <asm/uv/uv.h>
55#include <asm/setup.h> 55#include <asm/setup.h>
56 56
57#include "mm_internal.h"
58
59static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
60 unsigned long addr, unsigned long end)
61{
62 addr &= PMD_MASK;
63 for (; addr < end; addr += PMD_SIZE) {
64 pmd_t *pmd = pmd_page + pmd_index(addr);
65
66 if (!pmd_present(*pmd))
67 set_pmd(pmd, __pmd(addr | pmd_flag));
68 }
69}
70static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
71 unsigned long addr, unsigned long end)
72{
73 unsigned long next;
74
75 for (; addr < end; addr = next) {
76 pud_t *pud = pud_page + pud_index(addr);
77 pmd_t *pmd;
78
79 next = (addr & PUD_MASK) + PUD_SIZE;
80 if (next > end)
81 next = end;
82
83 if (pud_present(*pud)) {
84 pmd = pmd_offset(pud, 0);
85 ident_pmd_init(info->pmd_flag, pmd, addr, next);
86 continue;
87 }
88 pmd = (pmd_t *)info->alloc_pgt_page(info->context);
89 if (!pmd)
90 return -ENOMEM;
91 ident_pmd_init(info->pmd_flag, pmd, addr, next);
92 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
93 }
94
95 return 0;
96}
97
98int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
99 unsigned long addr, unsigned long end)
100{
101 unsigned long next;
102 int result;
103 int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
104
105 for (; addr < end; addr = next) {
106 pgd_t *pgd = pgd_page + pgd_index(addr) + off;
107 pud_t *pud;
108
109 next = (addr & PGDIR_MASK) + PGDIR_SIZE;
110 if (next > end)
111 next = end;
112
113 if (pgd_present(*pgd)) {
114 pud = pud_offset(pgd, 0);
115 result = ident_pud_init(info, pud, addr, next);
116 if (result)
117 return result;
118 continue;
119 }
120
121 pud = (pud_t *)info->alloc_pgt_page(info->context);
122 if (!pud)
123 return -ENOMEM;
124 result = ident_pud_init(info, pud, addr, next);
125 if (result)
126 return result;
127 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
128 }
129
130 return 0;
131}
132
57static int __init parse_direct_gbpages_off(char *arg) 133static int __init parse_direct_gbpages_off(char *arg)
58{ 134{
59 direct_gbpages = 0; 135 direct_gbpages = 0;
@@ -302,10 +378,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
302void __init cleanup_highmap(void) 378void __init cleanup_highmap(void)
303{ 379{
304 unsigned long vaddr = __START_KERNEL_map; 380 unsigned long vaddr = __START_KERNEL_map;
305 unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); 381 unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
306 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; 382 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
307 pmd_t *pmd = level2_kernel_pgt; 383 pmd_t *pmd = level2_kernel_pgt;
308 384
385 /*
386 * Native path, max_pfn_mapped is not set yet.
387 * Xen has valid max_pfn_mapped set in
388 * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
389 */
390 if (max_pfn_mapped)
391 vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
392
309 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { 393 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
310 if (pmd_none(*pmd)) 394 if (pmd_none(*pmd))
311 continue; 395 continue;
@@ -314,69 +398,24 @@ void __init cleanup_highmap(void)
314 } 398 }
315} 399}
316 400
317static __ref void *alloc_low_page(unsigned long *phys)
318{
319 unsigned long pfn = pgt_buf_end++;
320 void *adr;
321
322 if (after_bootmem) {
323 adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
324 *phys = __pa(adr);
325
326 return adr;
327 }
328
329 if (pfn >= pgt_buf_top)
330 panic("alloc_low_page: ran out of memory");
331
332 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
333 clear_page(adr);
334 *phys = pfn * PAGE_SIZE;
335 return adr;
336}
337
338static __ref void *map_low_page(void *virt)
339{
340 void *adr;
341 unsigned long phys, left;
342
343 if (after_bootmem)
344 return virt;
345
346 phys = __pa(virt);
347 left = phys & (PAGE_SIZE - 1);
348 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
349 adr = (void *)(((unsigned long)adr) | left);
350
351 return adr;
352}
353
354static __ref void unmap_low_page(void *adr)
355{
356 if (after_bootmem)
357 return;
358
359 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
360}
361
362static unsigned long __meminit 401static unsigned long __meminit
363phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, 402phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
364 pgprot_t prot) 403 pgprot_t prot)
365{ 404{
366 unsigned pages = 0; 405 unsigned long pages = 0, next;
367 unsigned long last_map_addr = end; 406 unsigned long last_map_addr = end;
368 int i; 407 int i;
369 408
370 pte_t *pte = pte_page + pte_index(addr); 409 pte_t *pte = pte_page + pte_index(addr);
371 410
372 for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { 411 for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
373 412 next = (addr & PAGE_MASK) + PAGE_SIZE;
374 if (addr >= end) { 413 if (addr >= end) {
375 if (!after_bootmem) { 414 if (!after_bootmem &&
376 for(; i < PTRS_PER_PTE; i++, pte++) 415 !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
377 set_pte(pte, __pte(0)); 416 !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
378 } 417 set_pte(pte, __pte(0));
379 break; 418 continue;
380 } 419 }
381 420
382 /* 421 /*
@@ -414,28 +453,25 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
414 int i = pmd_index(address); 453 int i = pmd_index(address);
415 454
416 for (; i < PTRS_PER_PMD; i++, address = next) { 455 for (; i < PTRS_PER_PMD; i++, address = next) {
417 unsigned long pte_phys;
418 pmd_t *pmd = pmd_page + pmd_index(address); 456 pmd_t *pmd = pmd_page + pmd_index(address);
419 pte_t *pte; 457 pte_t *pte;
420 pgprot_t new_prot = prot; 458 pgprot_t new_prot = prot;
421 459
460 next = (address & PMD_MASK) + PMD_SIZE;
422 if (address >= end) { 461 if (address >= end) {
423 if (!after_bootmem) { 462 if (!after_bootmem &&
424 for (; i < PTRS_PER_PMD; i++, pmd++) 463 !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
425 set_pmd(pmd, __pmd(0)); 464 !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
426 } 465 set_pmd(pmd, __pmd(0));
427 break; 466 continue;
428 } 467 }
429 468
430 next = (address & PMD_MASK) + PMD_SIZE;
431
432 if (pmd_val(*pmd)) { 469 if (pmd_val(*pmd)) {
433 if (!pmd_large(*pmd)) { 470 if (!pmd_large(*pmd)) {
434 spin_lock(&init_mm.page_table_lock); 471 spin_lock(&init_mm.page_table_lock);
435 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); 472 pte = (pte_t *)pmd_page_vaddr(*pmd);
436 last_map_addr = phys_pte_init(pte, address, 473 last_map_addr = phys_pte_init(pte, address,
437 end, prot); 474 end, prot);
438 unmap_low_page(pte);
439 spin_unlock(&init_mm.page_table_lock); 475 spin_unlock(&init_mm.page_table_lock);
440 continue; 476 continue;
441 } 477 }
@@ -464,19 +500,18 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
464 pages++; 500 pages++;
465 spin_lock(&init_mm.page_table_lock); 501 spin_lock(&init_mm.page_table_lock);
466 set_pte((pte_t *)pmd, 502 set_pte((pte_t *)pmd,
467 pfn_pte(address >> PAGE_SHIFT, 503 pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
468 __pgprot(pgprot_val(prot) | _PAGE_PSE))); 504 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
469 spin_unlock(&init_mm.page_table_lock); 505 spin_unlock(&init_mm.page_table_lock);
470 last_map_addr = next; 506 last_map_addr = next;
471 continue; 507 continue;
472 } 508 }
473 509
474 pte = alloc_low_page(&pte_phys); 510 pte = alloc_low_page();
475 last_map_addr = phys_pte_init(pte, address, end, new_prot); 511 last_map_addr = phys_pte_init(pte, address, end, new_prot);
476 unmap_low_page(pte);
477 512
478 spin_lock(&init_mm.page_table_lock); 513 spin_lock(&init_mm.page_table_lock);
479 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); 514 pmd_populate_kernel(&init_mm, pmd, pte);
480 spin_unlock(&init_mm.page_table_lock); 515 spin_unlock(&init_mm.page_table_lock);
481 } 516 }
482 update_page_count(PG_LEVEL_2M, pages); 517 update_page_count(PG_LEVEL_2M, pages);
@@ -492,27 +527,24 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
492 int i = pud_index(addr); 527 int i = pud_index(addr);
493 528
494 for (; i < PTRS_PER_PUD; i++, addr = next) { 529 for (; i < PTRS_PER_PUD; i++, addr = next) {
495 unsigned long pmd_phys;
496 pud_t *pud = pud_page + pud_index(addr); 530 pud_t *pud = pud_page + pud_index(addr);
497 pmd_t *pmd; 531 pmd_t *pmd;
498 pgprot_t prot = PAGE_KERNEL; 532 pgprot_t prot = PAGE_KERNEL;
499 533
500 if (addr >= end)
501 break;
502
503 next = (addr & PUD_MASK) + PUD_SIZE; 534 next = (addr & PUD_MASK) + PUD_SIZE;
504 535 if (addr >= end) {
505 if (!after_bootmem && !e820_any_mapped(addr, next, 0)) { 536 if (!after_bootmem &&
506 set_pud(pud, __pud(0)); 537 !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
538 !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
539 set_pud(pud, __pud(0));
507 continue; 540 continue;
508 } 541 }
509 542
510 if (pud_val(*pud)) { 543 if (pud_val(*pud)) {
511 if (!pud_large(*pud)) { 544 if (!pud_large(*pud)) {
512 pmd = map_low_page(pmd_offset(pud, 0)); 545 pmd = pmd_offset(pud, 0);
513 last_map_addr = phys_pmd_init(pmd, addr, end, 546 last_map_addr = phys_pmd_init(pmd, addr, end,
514 page_size_mask, prot); 547 page_size_mask, prot);
515 unmap_low_page(pmd);
516 __flush_tlb_all(); 548 __flush_tlb_all();
517 continue; 549 continue;
518 } 550 }
@@ -541,19 +573,19 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
541 pages++; 573 pages++;
542 spin_lock(&init_mm.page_table_lock); 574 spin_lock(&init_mm.page_table_lock);
543 set_pte((pte_t *)pud, 575 set_pte((pte_t *)pud,
544 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 576 pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
577 PAGE_KERNEL_LARGE));
545 spin_unlock(&init_mm.page_table_lock); 578 spin_unlock(&init_mm.page_table_lock);
546 last_map_addr = next; 579 last_map_addr = next;
547 continue; 580 continue;
548 } 581 }
549 582
550 pmd = alloc_low_page(&pmd_phys); 583 pmd = alloc_low_page();
551 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, 584 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
552 prot); 585 prot);
553 unmap_low_page(pmd);
554 586
555 spin_lock(&init_mm.page_table_lock); 587 spin_lock(&init_mm.page_table_lock);
556 pud_populate(&init_mm, pud, __va(pmd_phys)); 588 pud_populate(&init_mm, pud, pmd);
557 spin_unlock(&init_mm.page_table_lock); 589 spin_unlock(&init_mm.page_table_lock);
558 } 590 }
559 __flush_tlb_all(); 591 __flush_tlb_all();
@@ -578,28 +610,23 @@ kernel_physical_mapping_init(unsigned long start,
578 610
579 for (; start < end; start = next) { 611 for (; start < end; start = next) {
580 pgd_t *pgd = pgd_offset_k(start); 612 pgd_t *pgd = pgd_offset_k(start);
581 unsigned long pud_phys;
582 pud_t *pud; 613 pud_t *pud;
583 614
584 next = (start + PGDIR_SIZE) & PGDIR_MASK; 615 next = (start & PGDIR_MASK) + PGDIR_SIZE;
585 if (next > end)
586 next = end;
587 616
588 if (pgd_val(*pgd)) { 617 if (pgd_val(*pgd)) {
589 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); 618 pud = (pud_t *)pgd_page_vaddr(*pgd);
590 last_map_addr = phys_pud_init(pud, __pa(start), 619 last_map_addr = phys_pud_init(pud, __pa(start),
591 __pa(end), page_size_mask); 620 __pa(end), page_size_mask);
592 unmap_low_page(pud);
593 continue; 621 continue;
594 } 622 }
595 623
596 pud = alloc_low_page(&pud_phys); 624 pud = alloc_low_page();
597 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), 625 last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
598 page_size_mask); 626 page_size_mask);
599 unmap_low_page(pud);
600 627
601 spin_lock(&init_mm.page_table_lock); 628 spin_lock(&init_mm.page_table_lock);
602 pgd_populate(&init_mm, pgd, __va(pud_phys)); 629 pgd_populate(&init_mm, pgd, pud);
603 spin_unlock(&init_mm.page_table_lock); 630 spin_unlock(&init_mm.page_table_lock);
604 pgd_changed = true; 631 pgd_changed = true;
605 } 632 }
@@ -664,13 +691,11 @@ int arch_add_memory(int nid, u64 start, u64 size)
664{ 691{
665 struct pglist_data *pgdat = NODE_DATA(nid); 692 struct pglist_data *pgdat = NODE_DATA(nid);
666 struct zone *zone = pgdat->node_zones + ZONE_NORMAL; 693 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
667 unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; 694 unsigned long start_pfn = start >> PAGE_SHIFT;
668 unsigned long nr_pages = size >> PAGE_SHIFT; 695 unsigned long nr_pages = size >> PAGE_SHIFT;
669 int ret; 696 int ret;
670 697
671 last_mapped_pfn = init_memory_mapping(start, start + size); 698 init_memory_mapping(start, start + size);
672 if (last_mapped_pfn > max_pfn_mapped)
673 max_pfn_mapped = last_mapped_pfn;
674 699
675 ret = __add_pages(nid, zone, start_pfn, nr_pages); 700 ret = __add_pages(nid, zone, start_pfn, nr_pages);
676 WARN_ON_ONCE(ret); 701 WARN_ON_ONCE(ret);
@@ -686,6 +711,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory);
686 711
687static struct kcore_list kcore_vsyscall; 712static struct kcore_list kcore_vsyscall;
688 713
714static void __init register_page_bootmem_info(void)
715{
716#ifdef CONFIG_NUMA
717 int i;
718
719 for_each_online_node(i)
720 register_page_bootmem_info_node(NODE_DATA(i));
721#endif
722}
723
689void __init mem_init(void) 724void __init mem_init(void)
690{ 725{
691 long codesize, reservedpages, datasize, initsize; 726 long codesize, reservedpages, datasize, initsize;
@@ -698,11 +733,8 @@ void __init mem_init(void)
698 reservedpages = 0; 733 reservedpages = 0;
699 734
700 /* this will put all low memory onto the freelists */ 735 /* this will put all low memory onto the freelists */
701#ifdef CONFIG_NUMA 736 register_page_bootmem_info();
702 totalram_pages = numa_free_all_bootmem();
703#else
704 totalram_pages = free_all_bootmem(); 737 totalram_pages = free_all_bootmem();
705#endif
706 738
707 absent_pages = absent_pages_in_range(0, max_pfn); 739 absent_pages = absent_pages_in_range(0, max_pfn);
708 reservedpages = max_pfn - totalram_pages - absent_pages; 740 reservedpages = max_pfn - totalram_pages - absent_pages;
@@ -772,12 +804,11 @@ void set_kernel_text_ro(void)
772void mark_rodata_ro(void) 804void mark_rodata_ro(void)
773{ 805{
774 unsigned long start = PFN_ALIGN(_text); 806 unsigned long start = PFN_ALIGN(_text);
775 unsigned long rodata_start = 807 unsigned long rodata_start = PFN_ALIGN(__start_rodata);
776 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
777 unsigned long end = (unsigned long) &__end_rodata_hpage_align; 808 unsigned long end = (unsigned long) &__end_rodata_hpage_align;
778 unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); 809 unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
779 unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); 810 unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
780 unsigned long data_start = (unsigned long) &_sdata; 811 unsigned long all_end = PFN_ALIGN(&_end);
781 812
782 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 813 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
783 (end - start) >> 10); 814 (end - start) >> 10);
@@ -786,10 +817,10 @@ void mark_rodata_ro(void)
786 kernel_set_to_readonly = 1; 817 kernel_set_to_readonly = 1;
787 818
788 /* 819 /*
789 * The rodata section (but not the kernel text!) should also be 820 * The rodata/data/bss/brk section (but not the kernel text!)
790 * not-executable. 821 * should also be not-executable.
791 */ 822 */
792 set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); 823 set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
793 824
794 rodata_test(); 825 rodata_test();
795 826
@@ -802,12 +833,12 @@ void mark_rodata_ro(void)
802#endif 833#endif
803 834
804 free_init_pages("unused kernel memory", 835 free_init_pages("unused kernel memory",
805 (unsigned long) page_address(virt_to_page(text_end)), 836 (unsigned long) __va(__pa_symbol(text_end)),
806 (unsigned long) 837 (unsigned long) __va(__pa_symbol(rodata_start)));
807 page_address(virt_to_page(rodata_start))); 838
808 free_init_pages("unused kernel memory", 839 free_init_pages("unused kernel memory",
809 (unsigned long) page_address(virt_to_page(rodata_end)), 840 (unsigned long) __va(__pa_symbol(rodata_end)),
810 (unsigned long) page_address(virt_to_page(data_start))); 841 (unsigned long) __va(__pa_symbol(_sdata)));
811} 842}
812 843
813#endif 844#endif
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 000000000000..6b563a118891
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,19 @@
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+	return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+					   unsigned long end,
+					   unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+#endif	/* __X86_MM_INTERNAL_H */
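
This header is the whole of the new common allocation interface: code in arch/x86/mm now obtains zeroed, already-mapped page-table pages through alloc_low_pages()/alloc_low_page() instead of the per-file helpers removed above. The brk-buffer-then-memblock decision behind it can be modelled in a few lines of plain C; this is an informal sketch, not kernel code, BRK_PAGES and alloc_low_pages_model() are invented names for the illustration, and the real function additionally honours a can_use_brk_pgt flag and panics when memblock has nothing to offer.

#include <stdio.h>

#define BRK_PAGES 5	/* INIT_PGT_BUF_SIZE in the patch is 5 pages */

static unsigned long pgt_buf_end;		/* pages already handed out from the BRK buffer */
static unsigned long pgt_buf_top = BRK_PAGES;	/* size of the BRK buffer */

static const char *alloc_low_pages_model(unsigned int num)
{
	if (pgt_buf_end + num > pgt_buf_top)
		return "fall back to memblock within the already-mapped range";
	pgt_buf_end += num;
	return "served from the BRK page-table buffer";
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		printf("request for 2 pages: %s\n", alloc_low_pages_model(2));
	return 0;
}
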
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 2d125be1bae9..8504f3698753 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -193,7 +193,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
 static void __init setup_node_data(int nid, u64 start, u64 end)
 {
 	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
-	bool remapped = false;
 	u64 nd_pa;
 	void *nd;
 	int tnid;
@@ -205,37 +204,28 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 	if (end && (end - start) < NODE_MIN_SIZE)
 		return;
 
-	/* initialize remap allocator before aligning to ZONE_ALIGN */
-	init_alloc_remap(nid, start, end);
-
 	start = roundup(start, ZONE_ALIGN);
 
 	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
 	       nid, start, end - 1);
 
 	/*
-	 * Allocate node data.  Try remap allocator first, node-local
-	 * memory and then any node.  Never allocate in DMA zone.
+	 * Allocate node data.  Try node-local memory and then any node.
+	 * Never allocate in DMA zone.
 	 */
-	nd = alloc_remap(nid, nd_size);
-	if (nd) {
-		nd_pa = __pa(nd);
-		remapped = true;
-	} else {
-		nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
-		if (!nd_pa) {
-			pr_err("Cannot find %zu bytes in node %d\n",
-				nd_size, nid);
-			return;
-		}
-		nd = __va(nd_pa);
+	nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+	if (!nd_pa) {
+		pr_err("Cannot find %zu bytes in node %d\n",
+			nd_size, nid);
+		return;
 	}
+	nd = __va(nd_pa);
 
 	/* report and initialize */
-	printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]%s\n",
-	       nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+	printk(KERN_INFO "  NODE_DATA [mem %#010Lx-%#010Lx]\n",
+	       nd_pa, nd_pa + nd_size - 1);
 	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
-	if (!remapped && tnid != nid)
+	if (tnid != nid)
 		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
 
 	node_data[nid] = nd;
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 534255a36b6b..73a6d7395bd3 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
73 73
74extern unsigned long highend_pfn, highstart_pfn; 74extern unsigned long highend_pfn, highstart_pfn;
75 75
76#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
77
78static void *node_remap_start_vaddr[MAX_NUMNODES];
79void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
80
81/*
82 * Remap memory allocator
83 */
84static unsigned long node_remap_start_pfn[MAX_NUMNODES];
85static void *node_remap_end_vaddr[MAX_NUMNODES];
86static void *node_remap_alloc_vaddr[MAX_NUMNODES];
87
88/**
89 * alloc_remap - Allocate remapped memory
90 * @nid: NUMA node to allocate memory from
91 * @size: The size of allocation
92 *
93 * Allocate @size bytes from the remap area of NUMA node @nid. The
94 * size of the remap area is predetermined by init_alloc_remap() and
95 * only the callers considered there should call this function. For
96 * more info, please read the comment on top of init_alloc_remap().
97 *
98 * The caller must be ready to handle allocation failure from this
99 * function and fall back to regular memory allocator in such cases.
100 *
101 * CONTEXT:
102 * Single CPU early boot context.
103 *
104 * RETURNS:
105 * Pointer to the allocated memory on success, %NULL on failure.
106 */
107void *alloc_remap(int nid, unsigned long size)
108{
109 void *allocation = node_remap_alloc_vaddr[nid];
110
111 size = ALIGN(size, L1_CACHE_BYTES);
112
113 if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
114 return NULL;
115
116 node_remap_alloc_vaddr[nid] += size;
117 memset(allocation, 0, size);
118
119 return allocation;
120}
121
122#ifdef CONFIG_HIBERNATION
123/**
124 * resume_map_numa_kva - add KVA mapping to the temporary page tables created
125 * during resume from hibernation
126 * @pgd_base - temporary resume page directory
127 */
128void resume_map_numa_kva(pgd_t *pgd_base)
129{
130 int node;
131
132 for_each_online_node(node) {
133 unsigned long start_va, start_pfn, nr_pages, pfn;
134
135 start_va = (unsigned long)node_remap_start_vaddr[node];
136 start_pfn = node_remap_start_pfn[node];
137 nr_pages = (node_remap_end_vaddr[node] -
138 node_remap_start_vaddr[node]) >> PAGE_SHIFT;
139
140 printk(KERN_DEBUG "%s: node %d\n", __func__, node);
141
142 for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
143 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
144 pgd_t *pgd = pgd_base + pgd_index(vaddr);
145 pud_t *pud = pud_offset(pgd, vaddr);
146 pmd_t *pmd = pmd_offset(pud, vaddr);
147
148 set_pmd(pmd, pfn_pmd(start_pfn + pfn,
149 PAGE_KERNEL_LARGE_EXEC));
150
151 printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
152 __func__, vaddr, start_pfn + pfn);
153 }
154 }
155}
156#endif
157
158/**
159 * init_alloc_remap - Initialize remap allocator for a NUMA node
160 * @nid: NUMA node to initizlie remap allocator for
161 *
162 * NUMA nodes may end up without any lowmem. As allocating pgdat and
163 * memmap on a different node with lowmem is inefficient, a special
164 * remap allocator is implemented which can be used by alloc_remap().
165 *
166 * For each node, the amount of memory which will be necessary for
167 * pgdat and memmap is calculated and two memory areas of the size are
168 * allocated - one in the node and the other in lowmem; then, the area
169 * in the node is remapped to the lowmem area.
170 *
171 * As pgdat and memmap must be allocated in lowmem anyway, this
172 * doesn't waste lowmem address space; however, the actual lowmem
173 * which gets remapped over is wasted. The amount shouldn't be
174 * problematic on machines this feature will be used.
175 *
176 * Initialization failure isn't fatal. alloc_remap() is used
177 * opportunistically and the callers will fall back to other memory
178 * allocation mechanisms on failure.
179 */
180void __init init_alloc_remap(int nid, u64 start, u64 end)
181{
182 unsigned long start_pfn = start >> PAGE_SHIFT;
183 unsigned long end_pfn = end >> PAGE_SHIFT;
184 unsigned long size, pfn;
185 u64 node_pa, remap_pa;
186 void *remap_va;
187
188 /*
189 * The acpi/srat node info can show hot-add memroy zones where
190 * memory could be added but not currently present.
191 */
192 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
193 nid, start_pfn, end_pfn);
194
195 /* calculate the necessary space aligned to large page size */
196 size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
197 size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
198 size = ALIGN(size, LARGE_PAGE_BYTES);
199
200 /* allocate node memory and the lowmem remap area */
201 node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
202 if (!node_pa) {
203 pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
204 size, nid);
205 return;
206 }
207 memblock_reserve(node_pa, size);
208
209 remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
210 max_low_pfn << PAGE_SHIFT,
211 size, LARGE_PAGE_BYTES);
212 if (!remap_pa) {
213 pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
214 size, nid);
215 memblock_free(node_pa, size);
216 return;
217 }
218 memblock_reserve(remap_pa, size);
219 remap_va = phys_to_virt(remap_pa);
220
221 /* perform actual remap */
222 for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
223 set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
224 (node_pa >> PAGE_SHIFT) + pfn,
225 PAGE_KERNEL_LARGE);
226
227 /* initialize remap allocator parameters */
228 node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
229 node_remap_start_vaddr[nid] = remap_va;
230 node_remap_end_vaddr[nid] = remap_va + size;
231 node_remap_alloc_vaddr[nid] = remap_va;
232
233 printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
234 nid, node_pa, node_pa + size, remap_va, remap_va + size);
235}
236
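For context, the window set up above is consumed by alloc_remap() as a simple bump allocator over [node_remap_alloc_vaddr, node_remap_end_vaddr). The sketch below shows the idea, assuming the per-node arrays initialized by init_alloc_remap() and the usual kernel helpers (ALIGN(), L1_CACHE_BYTES, memset()); the in-tree numa_32.c version may differ in detail.

/*
 * Sketch only: bump-allocate from the remapped per-node window.
 * node_remap_alloc_vaddr[]/node_remap_end_vaddr[] are the arrays
 * initialized by init_alloc_remap() above; the cache-line alignment
 * is an assumption for illustration.
 */
void *alloc_remap(int nid, unsigned long size)
{
	void *allocation = node_remap_alloc_vaddr[nid];

	size = ALIGN(size, L1_CACHE_BYTES);

	if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
		return NULL;	/* caller falls back to regular lowmem */

	node_remap_alloc_vaddr[nid] += size;
	memset(allocation, 0, size);

	return allocation;
}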
237void __init initmem_init(void) 76void __init initmem_init(void)
238{ 77{
239 x86_numa_init(); 78 x86_numa_init();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 92e27119ee1a..9405ffc91502 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -10,16 +10,3 @@ void __init initmem_init(void)
10{ 10{
11 x86_numa_init(); 11 x86_numa_init();
12} 12}
13
14unsigned long __init numa_free_all_bootmem(void)
15{
16 unsigned long pages = 0;
17 int i;
18
19 for_each_online_node(i)
20 pages += free_all_bootmem_node(NODE_DATA(i));
21
22 pages += free_low_memory_core_early(MAX_NUMNODES);
23
24 return pages;
25}
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 7178c3afe05e..ad86ec91e640 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);
21 21
22void __init x86_numa_init(void); 22void __init x86_numa_init(void);
23 23
24#ifdef CONFIG_X86_64
25static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
26#else
27void __init init_alloc_remap(int nid, u64 start, u64 end);
28#endif
29
30#ifdef CONFIG_NUMA_EMU 24#ifdef CONFIG_NUMA_EMU
31void __init numa_emulation(struct numa_meminfo *numa_meminfo, 25void __init numa_emulation(struct numa_meminfo *numa_meminfo,
32 int numa_dist_cnt); 26 int numa_dist_cnt);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a718e0d23503..a1b1c88f9caf 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -94,12 +94,12 @@ static inline void split_page_count(int level) { }
94 94
95static inline unsigned long highmap_start_pfn(void) 95static inline unsigned long highmap_start_pfn(void)
96{ 96{
97 return __pa(_text) >> PAGE_SHIFT; 97 return __pa_symbol(_text) >> PAGE_SHIFT;
98} 98}
99 99
100static inline unsigned long highmap_end_pfn(void) 100static inline unsigned long highmap_end_pfn(void)
101{ 101{
102 return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; 102 return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
103} 103}
104 104
105#endif 105#endif
@@ -276,8 +276,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
276 * The .rodata section needs to be read-only. Using the pfn 276 * The .rodata section needs to be read-only. Using the pfn
277 * catches all aliases. 277 * catches all aliases.
278 */ 278 */
279 if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, 279 if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
280 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 280 __pa_symbol(__end_rodata) >> PAGE_SHIFT))
281 pgprot_val(forbidden) |= _PAGE_RW; 281 pgprot_val(forbidden) |= _PAGE_RW;
282 282
283#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) 283#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
@@ -364,6 +364,37 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
364EXPORT_SYMBOL_GPL(lookup_address); 364EXPORT_SYMBOL_GPL(lookup_address);
365 365
366/* 366/*
367 * This is necessary because __pa() does not work on some
368 * kinds of memory, like vmalloc() or the alloc_remap()
369 * areas on 32-bit NUMA systems. The percpu areas can
370 * end up in this kind of memory, for instance.
371 *
372 * This could be optimized, but it is only intended to be
373 * used at initialization time, and keeping it
374 * unoptimized should increase the testing coverage for
375 * the more obscure platforms.
376 */
377phys_addr_t slow_virt_to_phys(void *__virt_addr)
378{
379 unsigned long virt_addr = (unsigned long)__virt_addr;
380 phys_addr_t phys_addr;
381 unsigned long offset;
382 enum pg_level level;
383 unsigned long psize;
384 unsigned long pmask;
385 pte_t *pte;
386
387 pte = lookup_address(virt_addr, &level);
388 BUG_ON(!pte);
389 psize = page_level_size(level);
390 pmask = page_level_mask(level);
391 offset = virt_addr & ~pmask;
392 phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
393 return (phys_addr | offset);
394}
395EXPORT_SYMBOL_GPL(slow_virt_to_phys);
396
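To make the mask/offset arithmetic above concrete, here is a small standalone sketch of the same computation for an address backed by a 2M mapping; the virtual address and pfn below are made-up example values, not anything read from real page tables.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t vaddr = 0xffff880012345678ULL;	/* hypothetical kernel vaddr */
	uint64_t pfn   = 0x12200;		/* hypothetical pte_pfn() of the 2M entry */

	uint64_t psize  = 1ULL << 21;		/* page_level_size(PG_LEVEL_2M) */
	uint64_t pmask  = ~(psize - 1);		/* page_level_mask(PG_LEVEL_2M) */
	uint64_t offset = vaddr & ~pmask;	/* low 21 bits: 0x145678 */

	/* same combination slow_virt_to_phys() returns */
	printf("phys = %#llx\n",
	       (unsigned long long)((pfn << 12) | offset));	/* 0x12345678 */
	return 0;
}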
397/*
367 * Set the new pmd in all the pgds we know about: 398 * Set the new pmd in all the pgds we know about:
368 */ 399 */
369static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 400static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
@@ -396,7 +427,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
396 pte_t new_pte, old_pte, *tmp; 427 pte_t new_pte, old_pte, *tmp;
397 pgprot_t old_prot, new_prot, req_prot; 428 pgprot_t old_prot, new_prot, req_prot;
398 int i, do_split = 1; 429 int i, do_split = 1;
399 unsigned int level; 430 enum pg_level level;
400 431
401 if (cpa->force_split) 432 if (cpa->force_split)
402 return 1; 433 return 1;
@@ -412,15 +443,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
412 443
413 switch (level) { 444 switch (level) {
414 case PG_LEVEL_2M: 445 case PG_LEVEL_2M:
415 psize = PMD_PAGE_SIZE;
416 pmask = PMD_PAGE_MASK;
417 break;
418#ifdef CONFIG_X86_64 446#ifdef CONFIG_X86_64
419 case PG_LEVEL_1G: 447 case PG_LEVEL_1G:
420 psize = PUD_PAGE_SIZE;
421 pmask = PUD_PAGE_MASK;
422 break;
423#endif 448#endif
449 psize = page_level_size(level);
450 pmask = page_level_mask(level);
451 break;
424 default: 452 default:
425 do_split = -EINVAL; 453 do_split = -EINVAL;
426 goto out_unlock; 454 goto out_unlock;
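The removed PMD_PAGE_SIZE/PUD_PAGE_SIZE branches are folded into the new page_level_size()/page_level_mask() helpers. Roughly, they map a page-table level to its mapping size; the sketch below (with a local enum and PAGE_SHIFT so it stands alone) shows the idea, though the in-tree definitions in the x86 pgtable headers may be written differently.

/* Sketch only: derive size/mask from the page-table level.
 * The enum values and the 9-bits-per-level assumption mirror
 * 64-bit x86; the real helpers live in pgtable_types.h. */
enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G };

#define PAGE_SHIFT 12

static inline unsigned long page_level_shift(enum pg_level level)
{
	return PAGE_SHIFT + (level - PG_LEVEL_4K) * 9;	/* 12, 21, 30 */
}

static inline unsigned long page_level_size(enum pg_level level)
{
	return 1UL << page_level_shift(level);		/* 4K, 2M, 1G */
}

static inline unsigned long page_level_mask(enum pg_level level)
{
	return ~(page_level_size(level) - 1);
}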
@@ -551,16 +579,10 @@ static int split_large_page(pte_t *kpte, unsigned long address)
551 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 579 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
552 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 580 set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
553 581
554 if (address >= (unsigned long)__va(0) && 582 if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
555 address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) 583 PFN_DOWN(__pa(address)) + 1))
556 split_page_count(level); 584 split_page_count(level);
557 585
558#ifdef CONFIG_X86_64
559 if (address >= (unsigned long)__va(1UL<<32) &&
560 address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
561 split_page_count(level);
562#endif
563
564 /* 586 /*
565 * Install the new, split up pagetable. 587 * Install the new, split up pagetable.
566 * 588 *
@@ -729,13 +751,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
729 unsigned long vaddr; 751 unsigned long vaddr;
730 int ret; 752 int ret;
731 753
732 if (cpa->pfn >= max_pfn_mapped) 754 if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
733 return 0; 755 return 0;
734 756
735#ifdef CONFIG_X86_64
736 if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
737 return 0;
738#endif
739 /* 757 /*
740 * No need to redo, when the primary call touched the direct 758 * No need to redo, when the primary call touched the direct
741 * mapping already: 759 * mapping already:
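Both hunks above replace open-coded comparisons against max_low_pfn_mapped/max_pfn_mapped with pfn_range_is_mapped(). Conceptually that helper asks whether the whole pfn range lies inside one of the recorded directly-mapped regions; the sketch below illustrates the check, with the array and counter names assumed for illustration (the real bookkeeping lives in arch/x86/mm/init.c).

#include <stdbool.h>

/* Sketch only: assumed bookkeeping of directly-mapped pfn ranges. */
struct pfn_range { unsigned long start, end; };
static struct pfn_range pfn_mapped[16];
static int nr_pfn_mapped;

bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

	for (i = 0; i < nr_pfn_mapped; i++)
		if (start_pfn >= pfn_mapped[i].start &&
		    end_pfn <= pfn_mapped[i].end)
			return true;	/* entire range is already mapped */

	return false;
}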
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 0eb572eda406..2610bd93c896 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -560,10 +560,10 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
560{ 560{
561 unsigned long id_sz; 561 unsigned long id_sz;
562 562
563 if (base >= __pa(high_memory)) 563 if (base > __pa(high_memory-1))
564 return 0; 564 return 0;
565 565
566 id_sz = (__pa(high_memory) < base + size) ? 566 id_sz = (__pa(high_memory-1) <= base + size) ?
567 __pa(high_memory) - base : 567 __pa(high_memory) - base :
568 size; 568 size;
569 569
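The point of switching to high_memory - 1 is that high_memory points one byte past the last directly mapped byte, so translating it can trip the stricter __pa() checks. With made-up numbers: if the last mapped byte sits at physical 0x37ffffff, then __pa(high_memory - 1) is 0x37ffffff while __pa(high_memory) would be 0x38000000, one past the end; "base > 0x37ffffff" rejects exactly the same bases as "base >= 0x38000000", but never asks __pa() to translate an address outside the direct mapping.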
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e27fbf887f3b..193350b51f90 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -334,7 +334,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
334 if (changed && dirty) { 334 if (changed && dirty) {
335 *pmdp = entry; 335 *pmdp = entry;
336 pmd_update_defer(vma->vm_mm, address, pmdp); 336 pmd_update_defer(vma->vm_mm, address, pmdp);
337 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 337 /*
338 * We had a write-protection fault here and changed the pmd
339 * to be more permissive. No need to flush the TLB for that,
340 * #PF is architecturally guaranteed to do that and in the
341 * worst-case we'll generate a spurious fault.
342 */
338 } 343 }
339 344
340 return changed; 345 return changed;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index d2e2735327b4..e666cbbb9261 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,3 +1,4 @@
1#include <linux/bootmem.h>
1#include <linux/mmdebug.h> 2#include <linux/mmdebug.h>
2#include <linux/module.h> 3#include <linux/module.h>
3#include <linux/mm.h> 4#include <linux/mm.h>
@@ -8,33 +9,54 @@
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
10 11
12#ifdef CONFIG_DEBUG_VIRTUAL
11unsigned long __phys_addr(unsigned long x) 13unsigned long __phys_addr(unsigned long x)
12{ 14{
13 if (x >= __START_KERNEL_map) { 15 unsigned long y = x - __START_KERNEL_map;
14 x -= __START_KERNEL_map; 16
15 VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); 17 /* use the carry flag to determine if x was < __START_KERNEL_map */
16 x += phys_base; 18 if (unlikely(x > y)) {
19 x = y + phys_base;
20
21 VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
17 } else { 22 } else {
18 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 23 x = y + (__START_KERNEL_map - PAGE_OFFSET);
19 x -= PAGE_OFFSET; 24
20 VIRTUAL_BUG_ON(!phys_addr_valid(x)); 25 /* carry flag will be set if starting x was >= PAGE_OFFSET */
26 VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
21 } 27 }
28
22 return x; 29 return x;
23} 30}
24EXPORT_SYMBOL(__phys_addr); 31EXPORT_SYMBOL(__phys_addr);
25 32
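The "carry flag" comments below lean on unsigned wraparound: y = x - __START_KERNEL_map underflows to a huge value exactly when x is below the kernel-text mapping, so a single x > y comparison distinguishes the two halves of the address space. A standalone demo of the same trick; the base constant and test addresses are illustrative, not the real kernel values.

#include <stdio.h>
#include <stdint.h>

#define BASE 0xffffffff80000000ULL	/* stand-in for __START_KERNEL_map */

int main(void)
{
	uint64_t text_addr   = 0xffffffff81000000ULL;	/* above BASE */
	uint64_t direct_addr = 0xffff880000100000ULL;	/* below BASE */

	uint64_t y_text   = text_addr - BASE;	/* no wrap: y < x */
	uint64_t y_direct = direct_addr - BASE;	/* wraps:   y > x */

	printf("text in kernel map?   %d\n", text_addr > y_text);	/* 1 */
	printf("direct in kernel map? %d\n", direct_addr > y_direct);	/* 0 */
	return 0;
}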
33unsigned long __phys_addr_symbol(unsigned long x)
34{
35 unsigned long y = x - __START_KERNEL_map;
36
37 /* only check upper bounds since lower bounds will trigger carry */
38 VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
39
40 return y + phys_base;
41}
42EXPORT_SYMBOL(__phys_addr_symbol);
43#endif
44
26bool __virt_addr_valid(unsigned long x) 45bool __virt_addr_valid(unsigned long x)
27{ 46{
28 if (x >= __START_KERNEL_map) { 47 unsigned long y = x - __START_KERNEL_map;
29 x -= __START_KERNEL_map; 48
30 if (x >= KERNEL_IMAGE_SIZE) 49 /* use the carry flag to determine if x was < __START_KERNEL_map */
50 if (unlikely(x > y)) {
51 x = y + phys_base;
52
53 if (y >= KERNEL_IMAGE_SIZE)
31 return false; 54 return false;
32 x += phys_base;
33 } else { 55 } else {
34 if (x < PAGE_OFFSET) 56 x = y + (__START_KERNEL_map - PAGE_OFFSET);
35 return false; 57
36 x -= PAGE_OFFSET; 58 /* carry flag will be set if starting x was >= PAGE_OFFSET */
37 if (!phys_addr_valid(x)) 59 if ((x > y) || !phys_addr_valid(x))
38 return false; 60 return false;
39 } 61 }
40 62
@@ -47,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid);
47#ifdef CONFIG_DEBUG_VIRTUAL 69#ifdef CONFIG_DEBUG_VIRTUAL
48unsigned long __phys_addr(unsigned long x) 70unsigned long __phys_addr(unsigned long x)
49{ 71{
72 unsigned long phys_addr = x - PAGE_OFFSET;
50 /* VMALLOC_* aren't constants */ 73 /* VMALLOC_* aren't constants */
51 VIRTUAL_BUG_ON(x < PAGE_OFFSET); 74 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
52 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); 75 VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
53 return x - PAGE_OFFSET; 76 /* max_low_pfn is set early, but not _that_ early */
77 if (max_low_pfn) {
78 VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
79 BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
80 }
81 return phys_addr;
54} 82}
55EXPORT_SYMBOL(__phys_addr); 83EXPORT_SYMBOL(__phys_addr);
56#endif 84#endif