author     Chris Metcalf <cmetcalf@tilera.com>   2010-05-28 23:09:12 -0400
committer  Chris Metcalf <cmetcalf@tilera.com>   2010-06-04 17:11:18 -0400
commit     867e359b97c970a60626d5d76bbe2a8fadbf38fb (patch)
tree       c5ccbb7f5172e8555977119608ecb1eee3cc37e3 /arch/tile/kernel/setup.c
parent     5360bd776f73d0a7da571d72a09a03f237e99900 (diff)
arch/tile: core support for Tilera 32-bit chips.
This change is the core kernel support for TILEPro and TILE64 chips. No
driver support (except the console driver) is included yet.

This includes the relevant Linux headers in asm/; the low-level "Tile
architecture" headers in arch/, which are shared with the hypervisor,
etc., and are build-system agnostic; and the relevant hypervisor headers
in hv/.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Reviewed-by: Paul Mundt <lethal@linux-sh.org>
Diffstat (limited to 'arch/tile/kernel/setup.c')
-rw-r--r--   arch/tile/kernel/setup.c   1497
1 files changed, 1497 insertions, 0 deletions
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
new file mode 100644
index 000000000000..934136b61ceb
--- /dev/null
+++ b/arch/tile/kernel/setup.c
@@ -0,0 +1,1497 @@
1/*
2 * Copyright 2010 Tilera Corporation. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation, version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
11 * NON INFRINGEMENT. See the GNU General Public License for
12 * more details.
13 */
14
15#include <linux/sched.h>
16#include <linux/kernel.h>
17#include <linux/mmzone.h>
18#include <linux/bootmem.h>
19#include <linux/module.h>
20#include <linux/node.h>
21#include <linux/cpu.h>
22#include <linux/ioport.h>
23#include <linux/kexec.h>
24#include <linux/pci.h>
25#include <linux/initrd.h>
26#include <linux/io.h>
27#include <linux/highmem.h>
28#include <linux/smp.h>
29#include <linux/timex.h>
30#include <asm/setup.h>
31#include <asm/sections.h>
32#include <asm/sections.h>
33#include <asm/cacheflush.h>
34#include <asm/cacheflush.h>
35#include <asm/pgalloc.h>
36#include <asm/mmu_context.h>
37#include <hv/hypervisor.h>
38#include <arch/interrupts.h>
39
40/* <linux/smp.h> doesn't provide this definition. */
41#ifndef CONFIG_SMP
42#define setup_max_cpus 1
43#endif
44
45static inline int ABS(int x) { return x >= 0 ? x : -x; }
46
47/* Chip information */
48char chip_model[64] __write_once;
49
50struct pglist_data node_data[MAX_NUMNODES] __read_mostly;
51EXPORT_SYMBOL(node_data);
52
53/* We only create bootmem data on node 0. */
54static bootmem_data_t __initdata node0_bdata;
55
56/* Information on the NUMA nodes that we compute early */
57unsigned long __cpuinitdata node_start_pfn[MAX_NUMNODES];
58unsigned long __cpuinitdata node_end_pfn[MAX_NUMNODES];
59unsigned long __initdata node_memmap_pfn[MAX_NUMNODES];
60unsigned long __initdata node_percpu_pfn[MAX_NUMNODES];
61unsigned long __initdata node_free_pfn[MAX_NUMNODES];
62
63#ifdef CONFIG_HIGHMEM
64/* Page frame index of end of lowmem on each controller. */
65unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES];
66
67/* Number of pages that can be mapped into lowmem. */
68static unsigned long __initdata mappable_physpages;
69#endif
70
71/* Data on which physical memory controller corresponds to which NUMA node */
72int node_controller[MAX_NUMNODES] = { [0 ... MAX_NUMNODES-1] = -1 };
73
74#ifdef CONFIG_HIGHMEM
75/* Map information from VAs to PAs */
76unsigned long pbase_map[1 << (32 - HPAGE_SHIFT)]
77 __write_once __attribute__((aligned(L2_CACHE_BYTES)));
78EXPORT_SYMBOL(pbase_map);
79
80/* Map information from PAs to VAs */
81void *vbase_map[NR_PA_HIGHBIT_VALUES]
82 __write_once __attribute__((aligned(L2_CACHE_BYTES)));
83EXPORT_SYMBOL(vbase_map);
84#endif
85
86/* Node number as a function of the high PA bits */
87int highbits_to_node[NR_PA_HIGHBIT_VALUES] __write_once;
88EXPORT_SYMBOL(highbits_to_node);
89
90static unsigned int __initdata maxmem_pfn = -1U;
91static unsigned int __initdata maxnodemem_pfn[MAX_NUMNODES] = {
92 [0 ... MAX_NUMNODES-1] = -1U
93};
94static nodemask_t __initdata isolnodes;
95
96#ifdef CONFIG_PCI
97enum { DEFAULT_PCI_RESERVE_MB = 64 };
98static unsigned int __initdata pci_reserve_mb = DEFAULT_PCI_RESERVE_MB;
99unsigned long __initdata pci_reserve_start_pfn = -1U;
100unsigned long __initdata pci_reserve_end_pfn = -1U;
101#endif
102
103static int __init setup_maxmem(char *str)
104{
105 long maxmem_mb;
106 if (str == NULL || strict_strtol(str, 0, &maxmem_mb) != 0 ||
107 maxmem_mb == 0)
108 return -EINVAL;
109
110 maxmem_pfn = (maxmem_mb >> (HPAGE_SHIFT - 20)) <<
111 (HPAGE_SHIFT - PAGE_SHIFT);
112 printk("Forcing RAM used to no more than %dMB\n",
113 maxmem_pfn >> (20 - PAGE_SHIFT));
114 return 0;
115}
116early_param("maxmem", setup_maxmem);
117
118static int __init setup_maxnodemem(char *str)
119{
120 char *endp;
121 long maxnodemem_mb, node;
122
123 node = str ? simple_strtoul(str, &endp, 0) : INT_MAX;
124 if (node >= MAX_NUMNODES || *endp != ':' ||
125 strict_strtol(endp+1, 0, &maxnodemem_mb) != 0)
126 return -EINVAL;
127
128 maxnodemem_pfn[node] = (maxnodemem_mb >> (HPAGE_SHIFT - 20)) <<
129 (HPAGE_SHIFT - PAGE_SHIFT);
130 printk("Forcing RAM used on node %ld to no more than %dMB\n",
131 node, maxnodemem_pfn[node] >> (20 - PAGE_SHIFT));
132 return 0;
133}
134early_param("maxnodemem", setup_maxnodemem);
135
136static int __init setup_isolnodes(char *str)
137{
138 char buf[MAX_NUMNODES * 5];
139 if (str == NULL || nodelist_parse(str, isolnodes) != 0)
140 return -EINVAL;
141
142 nodelist_scnprintf(buf, sizeof(buf), isolnodes);
143 printk("Set isolnodes value to '%s'\n", buf);
144 return 0;
145}
146early_param("isolnodes", setup_isolnodes);
147
148#ifdef CONFIG_PCI
149static int __init setup_pci_reserve(char* str)
150{
151 unsigned long mb;
152
153 if (str == NULL || strict_strtoul(str, 0, &mb) != 0 ||
154 mb > 3 * 1024)
155 return -EINVAL;
156
157 pci_reserve_mb = mb;
158 printk("Reserving %dMB for PCIE root complex mappings\n",
159 pci_reserve_mb);
160 return 0;
161}
162early_param("pci_reserve", setup_pci_reserve);
163#endif
164
165#ifndef __tilegx__
166/*
167 * vmalloc=size forces the vmalloc area to be exactly 'size' bytes.
168 * This can be used to increase (or decrease) the vmalloc area.
169 */
170static int __init parse_vmalloc(char *arg)
171{
172 if (!arg)
173 return -EINVAL;
174
175 VMALLOC_RESERVE = (memparse(arg, &arg) + PGDIR_SIZE - 1) & PGDIR_MASK;
176
177 /* See validate_va() for more on this test. */
178 if ((long)_VMALLOC_START >= 0)
179 early_panic("\"vmalloc=%#lx\" value too large: maximum %#lx\n",
180 VMALLOC_RESERVE, _VMALLOC_END - 0x80000000UL);
181
182 return 0;
183}
184early_param("vmalloc", parse_vmalloc);
185#endif
186
187#ifdef CONFIG_HIGHMEM
188/*
189 * Determine for each controller where its lowmem is mapped and how
190 * much of it is mapped there. On controller zero, the first few
191 * megabytes are mapped at 0xfd000000 as code, so in principle we
192 * could start our data mappings higher up, but for now we don't
193 * bother, to avoid additional confusion.
194 *
195 * One question is whether, on systems with more than 768 MB and
196 * controllers of different sizes, to map in a proportionate amount of
197 * each one, or to try to map the same amount from each controller.
198 * (E.g. if we have three controllers with 256MB, 1GB, and 256MB
199 * respectively, do we map 256MB from each, or do we map 128 MB, 512
200 * MB, and 128 MB respectively?) For now we use a proportionate
201 * solution like the latter.
202 *
203 * The VA/PA mapping demands that we align our decisions at 16 MB
204 * boundaries so that we can rapidly convert VA to PA.
205 */
206static void *__init setup_pa_va_mapping(void)
207{
208 unsigned long curr_pages = 0;
209 unsigned long vaddr = PAGE_OFFSET;
210 nodemask_t highonlynodes = isolnodes;
211 int i, j;
212
213 memset(pbase_map, -1, sizeof(pbase_map));
214 memset(vbase_map, -1, sizeof(vbase_map));
215
216 /* Node zero cannot be isolated for LOWMEM purposes. */
217 node_clear(0, highonlynodes);
218
219 /* Count up the number of pages on non-highonlynodes controllers. */
220 mappable_physpages = 0;
221 for_each_online_node(i) {
222 if (!node_isset(i, highonlynodes))
223 mappable_physpages +=
224 node_end_pfn[i] - node_start_pfn[i];
225 }
226
227 for_each_online_node(i) {
228 unsigned long start = node_start_pfn[i];
229 unsigned long end = node_end_pfn[i];
230 unsigned long size = end - start;
231 unsigned long vaddr_end;
232
233 if (node_isset(i, highonlynodes)) {
234 /* Mark this controller as having no lowmem. */
235 node_lowmem_end_pfn[i] = start;
236 continue;
237 }
238
239 curr_pages += size;
240 if (mappable_physpages > MAXMEM_PFN) {
241 vaddr_end = PAGE_OFFSET +
242 (((u64)curr_pages * MAXMEM_PFN /
243 mappable_physpages)
244 << PAGE_SHIFT);
245 } else {
246 vaddr_end = PAGE_OFFSET + (curr_pages << PAGE_SHIFT);
247 }
248 for (j = 0; vaddr < vaddr_end; vaddr += HPAGE_SIZE, ++j) {
249 unsigned long this_pfn =
250 start + (j << HUGETLB_PAGE_ORDER);
251 pbase_map[vaddr >> HPAGE_SHIFT] = this_pfn;
252 if (vbase_map[__pfn_to_highbits(this_pfn)] ==
253 (void *)-1)
254 vbase_map[__pfn_to_highbits(this_pfn)] =
255 (void *)(vaddr & HPAGE_MASK);
256 }
257 node_lowmem_end_pfn[i] = start + (j << HUGETLB_PAGE_ORDER);
258 BUG_ON(node_lowmem_end_pfn[i] > end);
259 }
260
261 /* Return highest address of any mapped memory. */
262 return (void *)vaddr;
263}
264#endif /* CONFIG_HIGHMEM */
265
266/*
267 * Register our most important memory mappings with the debug stub.
268 *
269 * This is up to 4 mappings for lowmem, one mapping per memory
270 * controller, plus one for our text segment.
271 */
272void __cpuinit store_permanent_mappings(void)
273{
274 int i;
275
276 for_each_online_node(i) {
277 HV_PhysAddr pa = ((HV_PhysAddr)node_start_pfn[i]) << PAGE_SHIFT;
278#ifdef CONFIG_HIGHMEM
279 HV_PhysAddr high_mapped_pa = node_lowmem_end_pfn[i];
280#else
281 HV_PhysAddr high_mapped_pa = node_end_pfn[i];
282#endif
283
284 unsigned long pages = high_mapped_pa - node_start_pfn[i];
285 HV_VirtAddr addr = (HV_VirtAddr) __va(pa);
286 hv_store_mapping(addr, pages << PAGE_SHIFT, pa);
287 }
288
289 hv_store_mapping((HV_VirtAddr)_stext,
290 (uint32_t)(_einittext - _stext), 0);
291}
292
293/*
294 * Use hv_inquire_physical() to populate node_{start,end}_pfn[]
295 * and node_online_map, doing suitable sanity-checking.
296 * Also set min_low_pfn, max_low_pfn, and max_pfn.
297 */
298static void __init setup_memory(void)
299{
300 int i, j;
301 int highbits_seen[NR_PA_HIGHBIT_VALUES] = { 0 };
302#ifdef CONFIG_HIGHMEM
303 long highmem_pages;
304#endif
305#ifndef __tilegx__
306 int cap;
307#endif
308#if defined(CONFIG_HIGHMEM) || defined(__tilegx__)
309 long lowmem_pages;
310#endif
311
312 /* We are using a char to hold the cpu_2_node[] mapping */
313 BUG_ON(MAX_NUMNODES > 127);
314
315 /* Discover the ranges of memory available to us */
316 for (i = 0; ; ++i) {
317 unsigned long start, size, end, highbits;
318 HV_PhysAddrRange range = hv_inquire_physical(i);
319 if (range.size == 0)
320 break;
321#ifdef CONFIG_FLATMEM
322 if (i > 0) {
323 printk("Can't use discontiguous PAs: %#llx..%#llx\n",
324 range.start, range.start + range.size);
325 continue;
326 }
327#endif
328#ifndef __tilegx__
329 if ((unsigned long)range.start) {
330 printk("Range not at 4GB multiple: %#llx..%#llx\n",
331 range.start, range.start + range.size);
332 continue;
333 }
334#endif
335 if ((range.start & (HPAGE_SIZE-1)) != 0 ||
336 (range.size & (HPAGE_SIZE-1)) != 0) {
337 unsigned long long start_pa = range.start;
338 unsigned long long size = range.size;
339 range.start = (start_pa + HPAGE_SIZE - 1) & HPAGE_MASK;
340 range.size -= (range.start - start_pa);
341 range.size &= HPAGE_MASK;
342 printk("Range not hugepage-aligned: %#llx..%#llx:"
343 " now %#llx-%#llx\n",
344 start_pa, start_pa + size,
345 range.start, range.start + range.size);
346 }
347 highbits = __pa_to_highbits(range.start);
348 if (highbits >= NR_PA_HIGHBIT_VALUES) {
349 printk("PA high bits too high: %#llx..%#llx\n",
350 range.start, range.start + range.size);
351 continue;
352 }
353 if (highbits_seen[highbits]) {
354 printk("Range overlaps in high bits: %#llx..%#llx\n",
355 range.start, range.start + range.size);
356 continue;
357 }
358 highbits_seen[highbits] = 1;
359 if (PFN_DOWN(range.size) > maxnodemem_pfn[i]) {
360 int size = maxnodemem_pfn[i];
361 if (size > 0) {
362 printk("Maxnodemem reduced node %d to"
363 " %d pages\n", i, size);
364 range.size = (HV_PhysAddr)size << PAGE_SHIFT;
365 } else {
366 printk("Maxnodemem disabled node %d\n", i);
367 continue;
368 }
369 }
370 if (num_physpages + PFN_DOWN(range.size) > maxmem_pfn) {
371 int size = maxmem_pfn - num_physpages;
372 if (size > 0) {
373 printk("Maxmem reduced node %d to %d pages\n",
374 i, size);
375 range.size = (HV_PhysAddr)size << PAGE_SHIFT;
376 } else {
377 printk("Maxmem disabled node %d\n", i);
378 continue;
379 }
380 }
381 if (i >= MAX_NUMNODES) {
382 printk("Too many PA nodes (#%d): %#llx...%#llx\n",
383 i, range.start, range.start + range.size);
384 continue;
385 }
386
387 start = range.start >> PAGE_SHIFT;
388 size = range.size >> PAGE_SHIFT;
389 end = start + size;
390
391#ifndef __tilegx__
392 if (((HV_PhysAddr)end << PAGE_SHIFT) !=
393 (range.start + range.size)) {
394 printk("PAs too high to represent: %#llx..%#llx\n",
395 range.start, range.start + range.size);
396 continue;
397 }
398#endif
399#ifdef CONFIG_PCI
400 /*
401 * Blocks that overlap the pci reserved region must
402 * have enough space to hold the maximum percpu data
403 * region at the top of the range. If there isn't
404 * enough space above the reserved region, just
405 * truncate the node.
406 */
407 if (start <= pci_reserve_start_pfn &&
408 end > pci_reserve_start_pfn) {
409 unsigned int per_cpu_size =
410 __per_cpu_end - __per_cpu_start;
411 unsigned int percpu_pages =
412 NR_CPUS * (PFN_UP(per_cpu_size) >> PAGE_SHIFT);
413 if (end < pci_reserve_end_pfn + percpu_pages) {
414 end = pci_reserve_start_pfn;
415 printk("PCI mapping region reduced node %d to"
416 " %ld pages\n", i, end - start);
417 }
418 }
419#endif
420
421 for (j = __pfn_to_highbits(start);
422 j <= __pfn_to_highbits(end - 1); j++)
423 highbits_to_node[j] = i;
424
425 node_start_pfn[i] = start;
426 node_end_pfn[i] = end;
427 node_controller[i] = range.controller;
428 num_physpages += size;
429 max_pfn = end;
430
431 /* Mark node as online */
432 node_set(i, node_online_map);
433 node_set(i, node_possible_map);
434 }
435
436#ifndef __tilegx__
437 /*
438 * For 4KB pages, mem_map "struct page" data is 1% of the size
439 * of the physical memory, so can be quite big (640 MB for
440 * four 16G zones). These structures must be mapped in
441 * lowmem, and since we currently cap out at about 768 MB,
442 * it's impractical to try to use this much address space.
443 * For now, arbitrarily cap the amount of physical memory
444 * we're willing to use at 8 million pages (32GB of 4KB pages).
445 */
446 cap = 8 * 1024 * 1024; /* 8 million pages */
447 if (num_physpages > cap) {
448 int num_nodes = num_online_nodes();
449 int cap_each = cap / num_nodes;
450 unsigned long dropped_pages = 0;
451 for (i = 0; i < num_nodes; ++i) {
452 int size = node_end_pfn[i] - node_start_pfn[i];
453 if (size > cap_each) {
454 dropped_pages += (size - cap_each);
455 node_end_pfn[i] = node_start_pfn[i] + cap_each;
456 }
457 }
458 num_physpages -= dropped_pages;
459 printk(KERN_WARNING "Only using %ldMB memory;"
460 " ignoring %ldMB.\n",
461 num_physpages >> (20 - PAGE_SHIFT),
462 dropped_pages >> (20 - PAGE_SHIFT));
463 printk(KERN_WARNING "Consider using a larger page size.\n");
464 }
465#endif
466
467 /* Heap starts just above the last loaded address. */
468 min_low_pfn = PFN_UP((unsigned long)_end - PAGE_OFFSET);
469
470#ifdef CONFIG_HIGHMEM
471 /* Find where we map lowmem from each controller. */
472 high_memory = setup_pa_va_mapping();
473
474 /* Set max_low_pfn based on what node 0 can directly address. */
475 max_low_pfn = node_lowmem_end_pfn[0];
476
477 lowmem_pages = (mappable_physpages > MAXMEM_PFN) ?
478 MAXMEM_PFN : mappable_physpages;
479 highmem_pages = (long) (num_physpages - lowmem_pages);
480
481 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
482 pages_to_mb(highmem_pages > 0 ? highmem_pages : 0));
483 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
484 pages_to_mb(lowmem_pages));
485#else
486 /* Set max_low_pfn based on what node 0 can directly address. */
487 max_low_pfn = node_end_pfn[0];
488
489#ifndef __tilegx__
490 if (node_end_pfn[0] > MAXMEM_PFN) {
491 printk(KERN_WARNING "Only using %ldMB LOWMEM.\n",
492 MAXMEM>>20);
493 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
494 max_low_pfn = MAXMEM_PFN;
495 max_pfn = MAXMEM_PFN;
496 num_physpages = MAXMEM_PFN;
497 node_end_pfn[0] = MAXMEM_PFN;
498 } else {
499 printk(KERN_NOTICE "%ldMB memory available.\n",
500 pages_to_mb(node_end_pfn[0]));
501 }
502 for (i = 1; i < MAX_NUMNODES; ++i) {
503 node_start_pfn[i] = 0;
504 node_end_pfn[i] = 0;
505 }
506 high_memory = pfn_to_kaddr(node_end_pfn[0]);
507#else
508 lowmem_pages = 0;
509 for (i = 0; i < MAX_NUMNODES; ++i) {
510 int pages = node_end_pfn[i] - node_start_pfn[i];
511 lowmem_pages += pages;
512 if (pages)
513 high_memory = pfn_to_kaddr(node_end_pfn[i]);
514 }
515 printk(KERN_NOTICE "%ldMB memory available.\n",
516 pages_to_mb(lowmem_pages));
517#endif
518#endif
519}
520
521static void __init setup_bootmem_allocator(void)
522{
523 unsigned long bootmap_size, first_alloc_pfn, last_alloc_pfn;
524
525 /* Provide a node 0 bdata. */
526 NODE_DATA(0)->bdata = &node0_bdata;
527
528#ifdef CONFIG_PCI
529 /* Don't let boot memory alias the PCI region. */
530 last_alloc_pfn = min(max_low_pfn, pci_reserve_start_pfn);
531#else
532 last_alloc_pfn = max_low_pfn;
533#endif
534
535 /*
536 * Initialize the boot-time allocator (with low memory only):
537 * The first argument says where to put the bitmap, and the
538 * second says where the end of allocatable memory is.
539 */
540 bootmap_size = init_bootmem(min_low_pfn, last_alloc_pfn);
541
542 /*
543 * Let the bootmem allocator use all the space we've given it
544 * except for its own bitmap.
545 */
546 first_alloc_pfn = min_low_pfn + PFN_UP(bootmap_size);
547 if (first_alloc_pfn >= last_alloc_pfn)
548 early_panic("Not enough memory on controller 0 for bootmem\n");
549
550 free_bootmem(PFN_PHYS(first_alloc_pfn),
551 PFN_PHYS(last_alloc_pfn - first_alloc_pfn));
552
553#ifdef CONFIG_KEXEC
554 if (crashk_res.start != crashk_res.end)
555 reserve_bootmem(crashk_res.start,
556 crashk_res.end - crashk_res.start + 1, 0);
557#endif
558
559}
560
561void *__init alloc_remap(int nid, unsigned long size)
562{
563 int pages = node_end_pfn[nid] - node_start_pfn[nid];
564 void *map = pfn_to_kaddr(node_memmap_pfn[nid]);
565 BUG_ON(size != pages * sizeof(struct page));
566 memset(map, 0, size);
567 return map;
568}
569
570static int __init percpu_size(void)
571{
572 int size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
573#ifdef CONFIG_MODULES
574 if (size < PERCPU_ENOUGH_ROOM)
575 size = PERCPU_ENOUGH_ROOM;
576#endif
577 /* In several places we assume the per-cpu data fits on a huge page. */
578 BUG_ON(kdata_huge && size > HPAGE_SIZE);
579 return size;
580}
581
582static inline unsigned long alloc_bootmem_pfn(int size, unsigned long goal)
583{
584 void *kva = __alloc_bootmem(size, PAGE_SIZE, goal);
585 unsigned long pfn = kaddr_to_pfn(kva);
586 BUG_ON(goal && PFN_PHYS(pfn) != goal);
587 return pfn;
588}
589
590static void __init zone_sizes_init(void)
591{
592 unsigned long zones_size[MAX_NR_ZONES] = { 0 };
593 unsigned long node_percpu[MAX_NUMNODES] = { 0 };
594 int size = percpu_size();
595 int num_cpus = smp_height * smp_width;
596 int i;
597
598 for (i = 0; i < num_cpus; ++i)
599 node_percpu[cpu_to_node(i)] += size;
600
601 for_each_online_node(i) {
602 unsigned long start = node_start_pfn[i];
603 unsigned long end = node_end_pfn[i];
604#ifdef CONFIG_HIGHMEM
605 unsigned long lowmem_end = node_lowmem_end_pfn[i];
606#else
607 unsigned long lowmem_end = end;
608#endif
609 int memmap_size = (end - start) * sizeof(struct page);
610 node_free_pfn[i] = start;
611
612 /*
613 * Set aside pages for per-cpu data and the mem_map array.
614 *
615 * Since the per-cpu data requires special homecaching,
616 * if we are in kdata_huge mode, we put it at the end of
617 * the lowmem region. If we're not in kdata_huge mode,
618 * we take the per-cpu pages from the bottom of the
619 * controller, since that avoids fragmenting a huge page
620 * that users might want. We always take the memmap
621 * from the bottom of the controller, since with
622 * kdata_huge that lets it be under a huge TLB entry.
623 *
624 * If the user has requested isolnodes for a controller,
625 * though, there'll be no lowmem, so we just alloc_bootmem
626 * the memmap. There will be no percpu memory either.
627 */
628 if (__pfn_to_highbits(start) == 0) {
629 /* In low PAs, allocate via bootmem. */
630 unsigned long goal = 0;
631 node_memmap_pfn[i] =
632 alloc_bootmem_pfn(memmap_size, goal);
633 if (kdata_huge)
634 goal = PFN_PHYS(lowmem_end) - node_percpu[i];
635 if (node_percpu[i])
636 node_percpu_pfn[i] =
637 alloc_bootmem_pfn(node_percpu[i], goal);
638 } else if (cpu_isset(i, isolnodes)) {
639 node_memmap_pfn[i] = alloc_bootmem_pfn(memmap_size, 0);
640 BUG_ON(node_percpu[i] != 0);
641 } else {
642 /* In high PAs, just reserve some pages. */
643 node_memmap_pfn[i] = node_free_pfn[i];
644 node_free_pfn[i] += PFN_UP(memmap_size);
645 if (!kdata_huge) {
646 node_percpu_pfn[i] = node_free_pfn[i];
647 node_free_pfn[i] += PFN_UP(node_percpu[i]);
648 } else {
649 node_percpu_pfn[i] =
650 lowmem_end - PFN_UP(node_percpu[i]);
651 }
652 }
653
654#ifdef CONFIG_HIGHMEM
655 if (start > lowmem_end) {
656 zones_size[ZONE_NORMAL] = 0;
657 zones_size[ZONE_HIGHMEM] = end - start;
658 } else {
659 zones_size[ZONE_NORMAL] = lowmem_end - start;
660 zones_size[ZONE_HIGHMEM] = end - lowmem_end;
661 }
662#else
663 zones_size[ZONE_NORMAL] = end - start;
664#endif
665
666 /*
667 * Everyone shares node 0's bootmem allocator, but
668 * we use alloc_remap(), above, to put the actual
669 * struct page array on the individual controllers,
670 * which is most of the data that we actually care about.
671 * We can't place bootmem allocators on the other
672 * controllers since the bootmem allocator can only
673 * operate on 32-bit physical addresses.
674 */
675 NODE_DATA(i)->bdata = NODE_DATA(0)->bdata;
676
677 free_area_init_node(i, zones_size, start, NULL);
678 printk(KERN_DEBUG " DMA zone: %ld per-cpu pages\n",
679 PFN_UP(node_percpu[i]));
680
681 /* Track the type of memory on each node */
682 if (zones_size[ZONE_NORMAL])
683 node_set_state(i, N_NORMAL_MEMORY);
684#ifdef CONFIG_HIGHMEM
685 if (end != start)
686 node_set_state(i, N_HIGH_MEMORY);
687#endif
688
689 node_set_online(i);
690 }
691}
692
693#ifdef CONFIG_NUMA
694
695/* which logical CPUs are on which nodes */
696struct cpumask node_2_cpu_mask[MAX_NUMNODES] __write_once;
697EXPORT_SYMBOL(node_2_cpu_mask);
698
699/* which node each logical CPU is on */
700char cpu_2_node[NR_CPUS] __write_once __attribute__((aligned(L2_CACHE_BYTES)));
701EXPORT_SYMBOL(cpu_2_node);
702
703/* Return cpu_to_node() except for cpus not yet assigned, which return -1 */
704static int __init cpu_to_bound_node(int cpu, struct cpumask* unbound_cpus)
705{
706 if (!cpu_possible(cpu) || cpumask_test_cpu(cpu, unbound_cpus))
707 return -1;
708 else
709 return cpu_to_node(cpu);
710}
711
712/* Return number of immediately-adjacent tiles sharing the same NUMA node. */
713static int __init node_neighbors(int node, int cpu,
714 struct cpumask *unbound_cpus)
715{
716 int neighbors = 0;
717 int w = smp_width;
718 int h = smp_height;
719 int x = cpu % w;
720 int y = cpu / w;
721 if (x > 0 && cpu_to_bound_node(cpu-1, unbound_cpus) == node)
722 ++neighbors;
723 if (x < w-1 && cpu_to_bound_node(cpu+1, unbound_cpus) == node)
724 ++neighbors;
725 if (y > 0 && cpu_to_bound_node(cpu-w, unbound_cpus) == node)
726 ++neighbors;
727 if (y < h-1 && cpu_to_bound_node(cpu+w, unbound_cpus) == node)
728 ++neighbors;
729 return neighbors;
730}
731
732static void __init setup_numa_mapping(void)
733{
734 int distance[MAX_NUMNODES][NR_CPUS];
735 HV_Coord coord;
736 int cpu, node, cpus, i, x, y;
737 int num_nodes = num_online_nodes();
738 struct cpumask unbound_cpus;
739 nodemask_t default_nodes;
740
741 cpumask_clear(&unbound_cpus);
742
743 /* Get set of nodes we will use for defaults */
744 nodes_andnot(default_nodes, node_online_map, isolnodes);
745 if (nodes_empty(default_nodes)) {
746 BUG_ON(!node_isset(0, node_online_map));
747 printk("Forcing NUMA node zero available as a default node\n");
748 node_set(0, default_nodes);
749 }
750
751 /* Populate the distance[] array */
752 memset(distance, -1, sizeof(distance));
753 cpu = 0;
754 for (coord.y = 0; coord.y < smp_height; ++coord.y) {
755 for (coord.x = 0; coord.x < smp_width;
756 ++coord.x, ++cpu) {
757 BUG_ON(cpu >= nr_cpu_ids);
758 if (!cpu_possible(cpu)) {
759 cpu_2_node[cpu] = -1;
760 continue;
761 }
762 for_each_node_mask(node, default_nodes) {
763 HV_MemoryControllerInfo info =
764 hv_inquire_memory_controller(
765 coord, node_controller[node]);
766 distance[node][cpu] =
767 ABS(info.coord.x) + ABS(info.coord.y);
768 }
769 cpumask_set_cpu(cpu, &unbound_cpus);
770 }
771 }
772 cpus = cpu;
773
774 /*
775 * Round-robin through the NUMA nodes until all the cpus are
776 * assigned. We could be more clever here (e.g. create four
777 * sorted linked lists on the same set of cpu nodes, and pull
778 * off them in round-robin sequence, removing from all four
779 * lists each time) but given the relatively small numbers
780 * involved, O(n^2) seems OK for a one-time cost.
781 */
782 node = first_node(default_nodes);
783 while (!cpumask_empty(&unbound_cpus)) {
784 int best_cpu = -1;
785 int best_distance = INT_MAX;
786 for (cpu = 0; cpu < cpus; ++cpu) {
787 if (cpumask_test_cpu(cpu, &unbound_cpus)) {
788 /*
789 * Compute metric, which is how much
790 * closer the cpu is to this memory
791 * controller than the others, shifted
792 * up, and then the number of
793 * neighbors already in the node as an
794 * epsilon adjustment to try to keep
795 * the nodes compact.
796 */
797 int d = distance[node][cpu] * num_nodes;
798 for_each_node_mask(i, default_nodes) {
799 if (i != node)
800 d -= distance[i][cpu];
801 }
802 d *= 8; /* allow space for epsilon */
803 d -= node_neighbors(node, cpu, &unbound_cpus);
804 if (d < best_distance) {
805 best_cpu = cpu;
806 best_distance = d;
807 }
808 }
809 }
810 BUG_ON(best_cpu < 0);
811 cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]);
812 cpu_2_node[best_cpu] = node;
813 cpumask_clear_cpu(best_cpu, &unbound_cpus);
814 node = next_node(node, default_nodes);
815 if (node == MAX_NUMNODES)
816 node = first_node(default_nodes);
817 }
818
819 /* Print out node assignments and set defaults for disabled cpus */
820 cpu = 0;
821 for (y = 0; y < smp_height; ++y) {
822 printk(KERN_DEBUG "NUMA cpu-to-node row %d:", y);
823 for (x = 0; x < smp_width; ++x, ++cpu) {
824 if (cpu_to_node(cpu) < 0) {
825 printk(" -");
826 cpu_2_node[cpu] = first_node(default_nodes);
827 } else {
828 printk(" %d", cpu_to_node(cpu));
829 }
830 }
831 printk("\n");
832 }
833}
834
835static struct cpu cpu_devices[NR_CPUS];
836
837static int __init topology_init(void)
838{
839 int i;
840
841 for_each_online_node(i)
842 register_one_node(i);
843
844 for_each_present_cpu(i)
845 register_cpu(&cpu_devices[i], i);
846
847 return 0;
848}
849
850subsys_initcall(topology_init);
851
852#else /* !CONFIG_NUMA */
853
854#define setup_numa_mapping() do { } while (0)
855
856#endif /* CONFIG_NUMA */
857
858/**
859 * setup_mpls() - Allow the user-space code to access various SPRs.
860 *
861 * Also called from online_secondary().
862 */
863void __cpuinit setup_mpls(void)
864{
865 /* Allow asynchronous TLB interrupts. */
866#if CHIP_HAS_TILE_DMA()
867 raw_local_irq_unmask(INT_DMATLB_MISS);
868 raw_local_irq_unmask(INT_DMATLB_ACCESS);
869#endif
870#if CHIP_HAS_SN_PROC()
871 raw_local_irq_unmask(INT_SNITLB_MISS);
872#endif
873
874 /*
875 * Allow user access to many generic SPRs, like the cycle
876 * counter, PASS/FAIL/DONE, INTERRUPT_CRITICAL_SECTION, etc.
877 */
878 __insn_mtspr(SPR_MPL_WORLD_ACCESS_SET_0, 1);
879
880#if CHIP_HAS_SN()
881 /* Static network is not restricted. */
882 __insn_mtspr(SPR_MPL_SN_ACCESS_SET_0, 1);
883#endif
884#if CHIP_HAS_SN_PROC()
885 __insn_mtspr(SPR_MPL_SN_NOTIFY_SET_0, 1);
886 __insn_mtspr(SPR_MPL_SN_CPL_SET_0, 1);
887#endif
888
889 /*
890 * Set the MPL for interrupt control 0 to user level.
891 * This includes access to the SYSTEM_SAVE and EX_CONTEXT SPRs,
892 * as well as the PL 0 interrupt mask.
893 */
894 __insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
895}
896
897static int __initdata set_initramfs_file;
898static char __initdata initramfs_file[128] = "initramfs.cpio.gz";
899
900static int __init setup_initramfs_file(char *str)
901{
902 if (str == NULL)
903 return -EINVAL;
904 strncpy(initramfs_file, str, sizeof(initramfs_file) - 1);
905 set_initramfs_file = 1;
906
907 return 0;
908}
909early_param("initramfs_file", setup_initramfs_file);
910
911/*
912 * We look for an additional "initramfs.cpio.gz" file in the hvfs.
913 * If there is one, we allocate some memory for it and it will be
914 * unpacked to the initramfs after any built-in initramfs_data.
915 */
916static void __init load_hv_initrd(void)
917{
918 HV_FS_StatInfo stat;
919 int fd, rc;
920 void *initrd;
921
922 fd = hv_fs_findfile((HV_VirtAddr) initramfs_file);
923 if (fd == HV_ENOENT) {
924 if (set_initramfs_file)
925 printk("No such hvfs initramfs file '%s'\n",
926 initramfs_file);
927 return;
928 }
929 BUG_ON(fd < 0);
930 stat = hv_fs_fstat(fd);
931 BUG_ON(stat.size < 0);
932 if (stat.flags & HV_FS_ISDIR) {
933 printk("Ignoring hvfs file '%s': it's a directory.\n",
934 initramfs_file);
935 return;
936 }
937 initrd = alloc_bootmem_pages(stat.size);
938 rc = hv_fs_pread(fd, (HV_VirtAddr) initrd, stat.size, 0);
939 if (rc != stat.size) {
940 printk("Error reading %d bytes from hvfs file '%s': %d\n",
941 stat.size, initramfs_file, rc);
942 free_bootmem((unsigned long) initrd, stat.size);
943 return;
944 }
945 initrd_start = (unsigned long) initrd;
946 initrd_end = initrd_start + stat.size;
947}
948
949void __init free_initrd_mem(unsigned long begin, unsigned long end)
950{
951 free_bootmem(begin, end - begin);
952}
953
954static void __init validate_hv(void)
955{
956 /*
957 * It may already be too late, but let's check our built-in
958 * configuration against what the hypervisor is providing.
959 */
960 unsigned long glue_size = hv_sysconf(HV_SYSCONF_GLUE_SIZE);
961 int hv_page_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_SMALL);
962 int hv_hpage_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_LARGE);
963 HV_ASIDRange asid_range;
964
965#ifndef CONFIG_SMP
966 HV_Topology topology = hv_inquire_topology();
967 BUG_ON(topology.coord.x != 0 || topology.coord.y != 0);
968 if (topology.width != 1 || topology.height != 1) {
969 printk("Warning: booting UP kernel on %dx%d grid;"
970 " will ignore all but first tile.\n",
971 topology.width, topology.height);
972 }
973#endif
974
975 if (PAGE_OFFSET + HV_GLUE_START_CPA + glue_size > (unsigned long)_text)
976 early_panic("Hypervisor glue size %ld is too big!\n",
977 glue_size);
978 if (hv_page_size != PAGE_SIZE)
979 early_panic("Hypervisor page size %#x != our %#lx\n",
980 hv_page_size, PAGE_SIZE);
981 if (hv_hpage_size != HPAGE_SIZE)
982 early_panic("Hypervisor huge page size %#x != our %#lx\n",
983 hv_hpage_size, HPAGE_SIZE);
984
985#ifdef CONFIG_SMP
986 /*
987 * Some hypervisor APIs take a pointer to a bitmap array
988 * whose size is at least the number of cpus on the chip.
989 * We use a struct cpumask for this, so it must be big enough.
990 */
991 if ((smp_height * smp_width) > nr_cpu_ids)
992 early_panic("Hypervisor %d x %d grid too big for Linux"
993 " NR_CPUS %d\n", smp_height, smp_width,
994 nr_cpu_ids);
995#endif
996
997 /*
998 * Check that we're using allowed ASIDs, and initialize the
999 * various asid variables to their appropriate initial states.
1000 */
1001 asid_range = hv_inquire_asid(0);
1002 __get_cpu_var(current_asid) = min_asid = asid_range.start;
1003 max_asid = asid_range.start + asid_range.size - 1;
1004
1005 if (hv_confstr(HV_CONFSTR_CHIP_MODEL, (HV_VirtAddr)chip_model,
1006 sizeof(chip_model)) < 0) {
1007 printk("Warning: HV_CONFSTR_CHIP_MODEL not available\n");
1008 strlcpy(chip_model, "unknown", sizeof(chip_model));
1009 }
1010}
1011
1012static void __init validate_va(void)
1013{
1014#ifndef __tilegx__ /* FIXME: GX: probably some validation relevant here */
1015 /*
1016 * Similarly, make sure we're only using allowed VAs.
1017 * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
1018 * and 0 .. KERNEL_HIGH_VADDR.
1019 * In addition, make sure we CAN'T use the end of memory, since
1020 * we use the last chunk of each pgd for the pgd_list.
1021 */
1022 int i, fc_fd_ok = 0;
1023 unsigned long max_va = 0;
1024 unsigned long list_va =
1025 ((PGD_LIST_OFFSET / sizeof(pgd_t)) << PGDIR_SHIFT);
1026
1027 for (i = 0; ; ++i) {
1028 HV_VirtAddrRange range = hv_inquire_virtual(i);
1029 if (range.size == 0)
1030 break;
1031 if (range.start <= MEM_USER_INTRPT &&
1032 range.start + range.size >= MEM_HV_INTRPT)
1033 fc_fd_ok = 1;
1034 if (range.start == 0)
1035 max_va = range.size;
1036 BUG_ON(range.start + range.size > list_va);
1037 }
1038 if (!fc_fd_ok)
1039 early_panic("Hypervisor not configured for VAs 0xfc/0xfd\n");
1040 if (max_va == 0)
1041 early_panic("Hypervisor not configured for low VAs\n");
1042 if (max_va < KERNEL_HIGH_VADDR)
1043 early_panic("Hypervisor max VA %#lx smaller than %#lx\n",
1044 max_va, KERNEL_HIGH_VADDR);
1045
1046 /* Kernel PCs must have their high bit set; see intvec.S. */
1047 if ((long)VMALLOC_START >= 0)
1048 early_panic(
1049 "Linux VMALLOC region below the 2GB line (%#lx)!\n"
1050 "Reconfigure the kernel with fewer NR_HUGE_VMAPS\n"
1051 "or smaller VMALLOC_RESERVE.\n",
1052 VMALLOC_START);
1053#endif
1054}
1055
1056/*
1057 * cpu_lotar_map lists all the cpus that are valid for the supervisor
1058 * to cache data on at a page level, i.e. what cpus can be placed in
1059 * the LOTAR field of a PTE. It is equivalent to the set of possible
1060 * cpus plus any other cpus that are willing to share their cache.
1061 * It is set by hv_inquire_tiles(HV_INQ_TILES_LOTAR).
1062 */
1063struct cpumask __write_once cpu_lotar_map;
1064EXPORT_SYMBOL(cpu_lotar_map);
1065
1066#if CHIP_HAS_CBOX_HOME_MAP()
1067/*
1068 * hash_for_home_map lists all the tiles that hash-for-home data
1069 * will be cached on. Note that this may include tiles that are not
1070 * valid for this supervisor to use otherwise (e.g. if a hypervisor
1071 * device is being shared between multiple supervisors).
1072 * It is set by hv_inquire_tiles(HV_INQ_TILES_HFH_CACHE).
1073 */
1074struct cpumask hash_for_home_map;
1075EXPORT_SYMBOL(hash_for_home_map);
1076#endif
1077
1078/*
1079 * cpu_cacheable_map lists all the cpus whose caches the hypervisor can
1080 * flush on our behalf. It is set to cpu_possible_map OR'ed with
1081 * hash_for_home_map, and it is what should be passed to
1082 * hv_flush_remote() to flush all caches. Note that if there are
1083 * dedicated hypervisor driver tiles that have authorized use of their
1084 * cache, those tiles will only appear in cpu_lotar_map, NOT in
1085 * cpu_cacheable_map, as they are a special case.
1086 */
1087struct cpumask __write_once cpu_cacheable_map;
1088EXPORT_SYMBOL(cpu_cacheable_map);
1089
1090static __initdata struct cpumask disabled_map;
1091
1092static int __init disabled_cpus(char *str)
1093{
1094 int boot_cpu = smp_processor_id();
1095
1096 if (str == NULL || cpulist_parse_crop(str, &disabled_map) != 0)
1097 return -EINVAL;
1098 if (cpumask_test_cpu(boot_cpu, &disabled_map)) {
1099 printk("disabled_cpus: can't disable boot cpu %d\n", boot_cpu);
1100 cpumask_clear_cpu(boot_cpu, &disabled_map);
1101 }
1102 return 0;
1103}
1104
1105early_param("disabled_cpus", disabled_cpus);
1106
1107void __init print_disabled_cpus()
1108{
1109 if (!cpumask_empty(&disabled_map)) {
1110 char buf[100];
1111 cpulist_scnprintf(buf, sizeof(buf), &disabled_map);
1112 printk(KERN_INFO "CPUs not available for Linux: %s\n", buf);
1113 }
1114}
1115
1116static void __init setup_cpu_maps(void)
1117{
1118 struct cpumask hv_disabled_map, cpu_possible_init;
1119 int boot_cpu = smp_processor_id();
1120 int cpus, i, rc;
1121
1122 /* Learn which cpus are allowed by the hypervisor. */
1123 rc = hv_inquire_tiles(HV_INQ_TILES_AVAIL,
1124 (HV_VirtAddr) cpumask_bits(&cpu_possible_init),
1125 sizeof(cpu_cacheable_map));
1126 if (rc < 0)
1127 early_panic("hv_inquire_tiles(AVAIL) failed: rc %d\n", rc);
1128 if (!cpumask_test_cpu(boot_cpu, &cpu_possible_init))
1129 early_panic("Boot CPU %d disabled by hypervisor!\n", boot_cpu);
1130
1131 /* Compute the cpus disabled by the hvconfig file. */
1132 cpumask_complement(&hv_disabled_map, &cpu_possible_init);
1133
1134 /* Include them with the cpus disabled by "disabled_cpus". */
1135 cpumask_or(&disabled_map, &disabled_map, &hv_disabled_map);
1136
1137 /*
1138 * Disable every cpu after "setup_max_cpus". But don't mark
1139 * as disabled the cpus that are outside of our initial rectangle,
1140 * since that turns out to be confusing.
1141 */
1142 cpus = 1; /* this cpu */
1143 cpumask_set_cpu(boot_cpu, &disabled_map); /* ignore this cpu */
1144 for (i = 0; cpus < setup_max_cpus; ++i)
1145 if (!cpumask_test_cpu(i, &disabled_map))
1146 ++cpus;
1147 for (; i < smp_height * smp_width; ++i)
1148 cpumask_set_cpu(i, &disabled_map);
1149 cpumask_clear_cpu(boot_cpu, &disabled_map); /* reset this cpu */
1150 for (i = smp_height * smp_width; i < NR_CPUS; ++i)
1151 cpumask_clear_cpu(i, &disabled_map);
1152
1153 /*
1154 * Setup cpu_possible map as every cpu allocated to us, minus
1155 * the results of any "disabled_cpus" settings.
1156 */
1157 cpumask_andnot(&cpu_possible_init, &cpu_possible_init, &disabled_map);
1158 init_cpu_possible(&cpu_possible_init);
1159
1160 /* Learn which cpus are valid for LOTAR caching. */
1161 rc = hv_inquire_tiles(HV_INQ_TILES_LOTAR,
1162 (HV_VirtAddr) cpumask_bits(&cpu_lotar_map),
1163 sizeof(cpu_lotar_map));
1164 if (rc < 0) {
1165 printk("warning: no HV_INQ_TILES_LOTAR; using AVAIL\n");
1166 cpu_lotar_map = cpu_possible_map;
1167 }
1168
1169#if CHIP_HAS_CBOX_HOME_MAP()
1170 /* Retrieve set of CPUs used for hash-for-home caching */
1171 rc = hv_inquire_tiles(HV_INQ_TILES_HFH_CACHE,
1172 (HV_VirtAddr) hash_for_home_map.bits,
1173 sizeof(hash_for_home_map));
1174 if (rc < 0)
1175 early_panic("hv_inquire_tiles(HFH_CACHE) failed: rc %d\n", rc);
1176 cpumask_or(&cpu_cacheable_map, &cpu_possible_map, &hash_for_home_map);
1177#else
1178 cpu_cacheable_map = cpu_possible_map;
1179#endif
1180}
1181
1182
1183static int __init dataplane(char *str)
1184{
1185 printk("WARNING: dataplane support disabled in this kernel\n");
1186 return 0;
1187}
1188
1189early_param("dataplane", dataplane);
1190
1191#ifdef CONFIG_CMDLINE_BOOL
1192static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
1193#endif
1194
1195void __init setup_arch(char **cmdline_p)
1196{
1197 int len;
1198
1199#if defined(CONFIG_CMDLINE_BOOL) && defined(CONFIG_CMDLINE_OVERRIDE)
1200 len = hv_get_command_line((HV_VirtAddr) boot_command_line,
1201 COMMAND_LINE_SIZE);
1202 if (boot_command_line[0])
1203 printk("WARNING: ignoring dynamic command line \"%s\"\n",
1204 boot_command_line);
1205 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
1206#else
1207 char *hv_cmdline;
1208#if defined(CONFIG_CMDLINE_BOOL)
1209 if (builtin_cmdline[0]) {
1210 int builtin_len = strlcpy(boot_command_line, builtin_cmdline,
1211 COMMAND_LINE_SIZE);
1212 if (builtin_len < COMMAND_LINE_SIZE-1)
1213 boot_command_line[builtin_len++] = ' ';
1214 hv_cmdline = &boot_command_line[builtin_len];
1215 len = COMMAND_LINE_SIZE - builtin_len;
1216 } else
1217#endif
1218 {
1219 hv_cmdline = boot_command_line;
1220 len = COMMAND_LINE_SIZE;
1221 }
1222 len = hv_get_command_line((HV_VirtAddr) hv_cmdline, len);
1223 if (len < 0 || len > COMMAND_LINE_SIZE)
1224 early_panic("hv_get_command_line failed: %d\n", len);
1225#endif
1226
1227 *cmdline_p = boot_command_line;
1228
1229 /* Set disabled_map and setup_max_cpus very early */
1230 parse_early_param();
1231
1232 /* Make sure the kernel is compatible with the hypervisor. */
1233 validate_hv();
1234 validate_va();
1235
1236 setup_cpu_maps();
1237
1238
1239#ifdef CONFIG_PCI
1240 /*
1241 * Initialize the PCI structures. This is done before memory
1242 * setup so that we know whether or not a pci_reserve region
1243 * is necessary.
1244 */
1245 if (tile_pci_init() == 0)
1246 pci_reserve_mb = 0;
1247
1248 /* PCI systems reserve a region just below 4GB for mapping iomem. */
1249 pci_reserve_end_pfn = (1 << (32 - PAGE_SHIFT));
1250 pci_reserve_start_pfn = pci_reserve_end_pfn -
1251 (pci_reserve_mb << (20 - PAGE_SHIFT));
1252#endif
1253
1254 init_mm.start_code = (unsigned long) _text;
1255 init_mm.end_code = (unsigned long) _etext;
1256 init_mm.end_data = (unsigned long) _edata;
1257 init_mm.brk = (unsigned long) _end;
1258
1259 setup_memory();
1260 store_permanent_mappings();
1261 setup_bootmem_allocator();
1262
1263 /*
1264 * NOTE: before this point _nobody_ is allowed to allocate
1265 * any memory using the bootmem allocator.
1266 */
1267
1268 paging_init();
1269 setup_numa_mapping();
1270 zone_sizes_init();
1271 set_page_homes();
1272 setup_mpls();
1273 setup_clock();
1274 load_hv_initrd();
1275}
1276
1277
1278/*
1279 * Set up per-cpu memory.
1280 */
1281
1282unsigned long __per_cpu_offset[NR_CPUS] __write_once;
1283EXPORT_SYMBOL(__per_cpu_offset);
1284
1285static size_t __initdata pfn_offset[MAX_NUMNODES] = { 0 };
1286static unsigned long __initdata percpu_pfn[NR_CPUS] = { 0 };
1287
1288/*
1289 * As the percpu code allocates pages, we return the pages from the
1290 * end of the node for the specified cpu.
1291 */
1292static void *__init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
1293{
1294 int nid = cpu_to_node(cpu);
1295 unsigned long pfn = node_percpu_pfn[nid] + pfn_offset[nid];
1296
1297 BUG_ON(size % PAGE_SIZE != 0);
1298 pfn_offset[nid] += size / PAGE_SIZE;
1299 if (percpu_pfn[cpu] == 0)
1300 percpu_pfn[cpu] = pfn;
1301 return pfn_to_kaddr(pfn);
1302}
1303
1304/*
1305 * Pages reserved for percpu memory are not freeable, and in any case we are
1306 * on a short path to panic() in setup_per_cpu_area() at this point anyway.
1307 */
1308static void __init pcpu_fc_free(void *ptr, size_t size)
1309{
1310}
1311
1312/*
1313 * Set up vmalloc page tables using bootmem for the percpu code.
1314 */
1315static void __init pcpu_fc_populate_pte(unsigned long addr)
1316{
1317 pgd_t *pgd;
1318 pud_t *pud;
1319 pmd_t *pmd;
1320 pte_t *pte;
1321
1322 BUG_ON(pgd_addr_invalid(addr));
1323
1324 pgd = swapper_pg_dir + pgd_index(addr);
1325 pud = pud_offset(pgd, addr);
1326 BUG_ON(!pud_present(*pud));
1327 pmd = pmd_offset(pud, addr);
1328 if (pmd_present(*pmd)) {
1329 BUG_ON(pmd_huge_page(*pmd));
1330 } else {
1331 pte = __alloc_bootmem(L2_KERNEL_PGTABLE_SIZE,
1332 HV_PAGE_TABLE_ALIGN, 0);
1333 pmd_populate_kernel(&init_mm, pmd, pte);
1334 }
1335}
1336
1337void __init setup_per_cpu_areas(void)
1338{
1339 struct page *pg;
1340 unsigned long delta, pfn, lowmem_va;
1341 unsigned long size = percpu_size();
1342 char *ptr;
1343 int rc, cpu, i;
1344
1345 rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, pcpu_fc_alloc,
1346 pcpu_fc_free, pcpu_fc_populate_pte);
1347 if (rc < 0)
1348 panic("Cannot initialize percpu area (err=%d)", rc);
1349
1350 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
1351 for_each_possible_cpu(cpu) {
1352 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1353
1354 /* finv the copy out of cache so we can change homecache */
1355 ptr = pcpu_base_addr + pcpu_unit_offsets[cpu];
1356 __finv_buffer(ptr, size);
1357 pfn = percpu_pfn[cpu];
1358
1359 /* Rewrite the page tables to cache on that cpu */
1360 pg = pfn_to_page(pfn);
1361 for (i = 0; i < size; i += PAGE_SIZE, ++pfn, ++pg) {
1362
1363 /* Update the vmalloc mapping and page home. */
1364 pte_t *ptep =
1365 virt_to_pte(NULL, (unsigned long)ptr + i);
1366 pte_t pte = *ptep;
1367 BUG_ON(pfn != pte_pfn(pte));
1368 pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3);
1369 pte = set_remote_cache_cpu(pte, cpu);
1370 set_pte(ptep, pte);
1371
1372 /* Update the lowmem mapping for consistency. */
1373 lowmem_va = (unsigned long)pfn_to_kaddr(pfn);
1374 ptep = virt_to_pte(NULL, lowmem_va);
1375 if (pte_huge(*ptep)) {
1376 printk(KERN_DEBUG "early shatter of huge page"
1377 " at %#lx\n", lowmem_va);
1378 shatter_pmd((pmd_t *)ptep);
1379 ptep = virt_to_pte(NULL, lowmem_va);
1380 BUG_ON(pte_huge(*ptep));
1381 }
1382 BUG_ON(pfn != pte_pfn(*ptep));
1383 set_pte(ptep, pte);
1384 }
1385 }
1386
1387 /* Set our thread pointer appropriately. */
1388 set_my_cpu_offset(__per_cpu_offset[smp_processor_id()]);
1389
1390 /* Make sure the finv's have completed. */
1391 mb_incoherent();
1392
1393 /* Flush the TLB so we reference it properly from here on out. */
1394 local_flush_tlb_all();
1395}
1396
1397static struct resource data_resource = {
1398 .name = "Kernel data",
1399 .start = 0,
1400 .end = 0,
1401 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1402};
1403
1404static struct resource code_resource = {
1405 .name = "Kernel code",
1406 .start = 0,
1407 .end = 0,
1408 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1409};
1410
1411/*
1412 * We reserve all resources above 4GB so that PCI won't try to put
1413 * mappings above 4GB; the standard allows that for some devices but
1414 * the probing code truncates values to 32 bits.
1415 */
1416#ifdef CONFIG_PCI
1417static struct resource* __init
1418insert_non_bus_resource(void)
1419{
1420 struct resource *res =
1421 kzalloc(sizeof(struct resource), GFP_ATOMIC);
1422 res->name = "Non-Bus Physical Address Space";
1423 res->start = (1ULL << 32);
1424 res->end = -1LL;
1425 res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1426 if (insert_resource(&iomem_resource, res)) {
1427 kfree(res);
1428 return NULL;
1429 }
1430 return res;
1431}
1432#endif
1433
1434static struct resource* __init
1435insert_ram_resource(u64 start_pfn, u64 end_pfn)
1436{
1437 struct resource *res =
1438 kzalloc(sizeof(struct resource), GFP_ATOMIC);
1439 res->name = "System RAM";
1440 res->start = start_pfn << PAGE_SHIFT;
1441 res->end = (end_pfn << PAGE_SHIFT) - 1;
1442 res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1443 if (insert_resource(&iomem_resource, res)) {
1444 kfree(res);
1445 return NULL;
1446 }
1447 return res;
1448}
1449
1450/*
1451 * Request address space for all standard resources
1452 *
1453 * If the system includes PCI root complex drivers, we need to create
1454 * a window just below 4GB where PCI BARs can be mapped.
1455 */
1456static int __init request_standard_resources(void)
1457{
1458 int i;
1459 enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
1460
1461 iomem_resource.end = -1LL;
1462#ifdef CONFIG_PCI
1463 insert_non_bus_resource();
1464#endif
1465
1466 for_each_online_node(i) {
1467 u64 start_pfn = node_start_pfn[i];
1468 u64 end_pfn = node_end_pfn[i];
1469
1470#ifdef CONFIG_PCI
1471 if (start_pfn <= pci_reserve_start_pfn &&
1472 end_pfn > pci_reserve_start_pfn) {
1473 if (end_pfn > pci_reserve_end_pfn)
1474 insert_ram_resource(pci_reserve_end_pfn,
1475 end_pfn);
1476 end_pfn = pci_reserve_start_pfn;
1477 }
1478#endif
1479 insert_ram_resource(start_pfn, end_pfn);
1480 }
1481
1482 code_resource.start = __pa(_text - CODE_DELTA);
1483 code_resource.end = __pa(_etext - CODE_DELTA)-1;
1484 data_resource.start = __pa(_sdata);
1485 data_resource.end = __pa(_end)-1;
1486
1487 insert_resource(&iomem_resource, &code_resource);
1488 insert_resource(&iomem_resource, &data_resource);
1489
1490#ifdef CONFIG_KEXEC
1491 insert_resource(&iomem_resource, &crashk_res);
1492#endif
1493
1494 return 0;
1495}
1496
1497subsys_initcall(request_standard_resources);