aboutsummaryrefslogtreecommitdiffstats
path: root/arch/sparc64
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2008-04-23 08:40:25 -0400
committerDavid S. Miller <davem@davemloft.net>2008-04-24 02:32:17 -0400
commit919ee677b656c52c5f86d3d916786891220d5452 (patch)
treedd1202209945b4c2529af074effdb7300edda684 /arch/sparc64
parent1f261ef53ba06658dfeb5a9c3007d0ad1b85cadf (diff)
[SPARC64]: Add NUMA support.
Currently there is only code to parse NUMA attributes on sun4v/niagara systems, but later on we will add such parsing for older systems. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'arch/sparc64')
-rw-r--r--arch/sparc64/Kconfig20
-rw-r--r--arch/sparc64/defconfig99
-rw-r--r--arch/sparc64/kernel/sysfs.c12
-rw-r--r--arch/sparc64/mm/init.c796
4 files changed, 792 insertions, 135 deletions
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index df3eacb5ca15..8acc5cc38621 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -250,6 +250,26 @@ endchoice
250 250
251endmenu 251endmenu
252 252
253config NUMA
254 bool "NUMA support"
255
256config NODES_SHIFT
257 int
258 default "4"
259 depends on NEED_MULTIPLE_NODES
260
261# Some NUMA nodes have memory ranges that span
262# other nodes. Even though a pfn is valid and
263# between a node's start and end pfns, it may not
264# reside on that node. See memmap_init_zone()
265# for details.
266config NODES_SPAN_OTHER_NODES
267 def_bool y
268 depends on NEED_MULTIPLE_NODES
269
270config ARCH_POPULATES_NODE_MAP
271 def_bool y
272
253config ARCH_SELECT_MEMORY_MODEL 273config ARCH_SELECT_MEMORY_MODEL
254 def_bool y 274 def_bool y
255 275
diff --git a/arch/sparc64/defconfig b/arch/sparc64/defconfig
index e1835868ad36..92f79680f70d 100644
--- a/arch/sparc64/defconfig
+++ b/arch/sparc64/defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.25 3# Linux kernel version: 2.6.25-numa
4# Sun Apr 20 01:33:21 2008 4# Wed Apr 23 04:49:08 2008
5# 5#
6CONFIG_SPARC=y 6CONFIG_SPARC=y
7CONFIG_SPARC64=y 7CONFIG_SPARC64=y
@@ -152,6 +152,8 @@ CONFIG_GENERIC_CALIBRATE_DELAY=y
152CONFIG_HUGETLB_PAGE_SIZE_4MB=y 152CONFIG_HUGETLB_PAGE_SIZE_4MB=y
153# CONFIG_HUGETLB_PAGE_SIZE_512K is not set 153# CONFIG_HUGETLB_PAGE_SIZE_512K is not set
154# CONFIG_HUGETLB_PAGE_SIZE_64K is not set 154# CONFIG_HUGETLB_PAGE_SIZE_64K is not set
155# CONFIG_NUMA is not set
156CONFIG_ARCH_POPULATES_NODE_MAP=y
155CONFIG_ARCH_SELECT_MEMORY_MODEL=y 157CONFIG_ARCH_SELECT_MEMORY_MODEL=y
156CONFIG_ARCH_SPARSEMEM_ENABLE=y 158CONFIG_ARCH_SPARSEMEM_ENABLE=y
157CONFIG_ARCH_SPARSEMEM_DEFAULT=y 159CONFIG_ARCH_SPARSEMEM_DEFAULT=y
@@ -787,7 +789,6 @@ CONFIG_I2C_ALGOBIT=y
787# CONFIG_SENSORS_PCF8574 is not set 789# CONFIG_SENSORS_PCF8574 is not set
788# CONFIG_PCF8575 is not set 790# CONFIG_PCF8575 is not set
789# CONFIG_SENSORS_PCF8591 is not set 791# CONFIG_SENSORS_PCF8591 is not set
790# CONFIG_TPS65010 is not set
791# CONFIG_SENSORS_MAX6875 is not set 792# CONFIG_SENSORS_MAX6875 is not set
792# CONFIG_SENSORS_TSL2550 is not set 793# CONFIG_SENSORS_TSL2550 is not set
793# CONFIG_I2C_DEBUG_CORE is not set 794# CONFIG_I2C_DEBUG_CORE is not set
@@ -869,6 +870,7 @@ CONFIG_SSB_POSSIBLE=y
869# Multifunction device drivers 870# Multifunction device drivers
870# 871#
871# CONFIG_MFD_SM501 is not set 872# CONFIG_MFD_SM501 is not set
873# CONFIG_HTC_PASIC3 is not set
872 874
873# 875#
874# Multimedia devices 876# Multimedia devices
@@ -1219,10 +1221,6 @@ CONFIG_USB_STORAGE=m
1219# CONFIG_NEW_LEDS is not set 1221# CONFIG_NEW_LEDS is not set
1220# CONFIG_INFINIBAND is not set 1222# CONFIG_INFINIBAND is not set
1221# CONFIG_RTC_CLASS is not set 1223# CONFIG_RTC_CLASS is not set
1222
1223#
1224# Userspace I/O
1225#
1226# CONFIG_UIO is not set 1224# CONFIG_UIO is not set
1227 1225
1228# 1226#
@@ -1399,6 +1397,7 @@ CONFIG_SCHEDSTATS=y
1399CONFIG_DEBUG_BUGVERBOSE=y 1397CONFIG_DEBUG_BUGVERBOSE=y
1400# CONFIG_DEBUG_INFO is not set 1398# CONFIG_DEBUG_INFO is not set
1401# CONFIG_DEBUG_VM is not set 1399# CONFIG_DEBUG_VM is not set
1400# CONFIG_DEBUG_WRITECOUNT is not set
1402# CONFIG_DEBUG_LIST is not set 1401# CONFIG_DEBUG_LIST is not set
1403# CONFIG_DEBUG_SG is not set 1402# CONFIG_DEBUG_SG is not set
1404# CONFIG_BOOT_PRINTK_DELAY is not set 1403# CONFIG_BOOT_PRINTK_DELAY is not set
@@ -1425,53 +1424,82 @@ CONFIG_ASYNC_CORE=m
1425CONFIG_ASYNC_MEMCPY=m 1424CONFIG_ASYNC_MEMCPY=m
1426CONFIG_ASYNC_XOR=m 1425CONFIG_ASYNC_XOR=m
1427CONFIG_CRYPTO=y 1426CONFIG_CRYPTO=y
1427
1428#
1429# Crypto core or helper
1430#
1428CONFIG_CRYPTO_ALGAPI=y 1431CONFIG_CRYPTO_ALGAPI=y
1429CONFIG_CRYPTO_AEAD=y 1432CONFIG_CRYPTO_AEAD=y
1430CONFIG_CRYPTO_BLKCIPHER=y 1433CONFIG_CRYPTO_BLKCIPHER=y
1431# CONFIG_CRYPTO_SEQIV is not set
1432CONFIG_CRYPTO_HASH=y 1434CONFIG_CRYPTO_HASH=y
1433CONFIG_CRYPTO_MANAGER=y 1435CONFIG_CRYPTO_MANAGER=y
1436CONFIG_CRYPTO_GF128MUL=m
1437CONFIG_CRYPTO_NULL=m
1438# CONFIG_CRYPTO_CRYPTD is not set
1439CONFIG_CRYPTO_AUTHENC=y
1440CONFIG_CRYPTO_TEST=m
1441
1442#
1443# Authenticated Encryption with Associated Data
1444#
1445# CONFIG_CRYPTO_CCM is not set
1446# CONFIG_CRYPTO_GCM is not set
1447# CONFIG_CRYPTO_SEQIV is not set
1448
1449#
1450# Block modes
1451#
1452CONFIG_CRYPTO_CBC=y
1453# CONFIG_CRYPTO_CTR is not set
1454# CONFIG_CRYPTO_CTS is not set
1455CONFIG_CRYPTO_ECB=m
1456CONFIG_CRYPTO_LRW=m
1457CONFIG_CRYPTO_PCBC=m
1458CONFIG_CRYPTO_XTS=m
1459
1460#
1461# Hash modes
1462#
1434CONFIG_CRYPTO_HMAC=y 1463CONFIG_CRYPTO_HMAC=y
1435CONFIG_CRYPTO_XCBC=y 1464CONFIG_CRYPTO_XCBC=y
1436CONFIG_CRYPTO_NULL=m 1465
1466#
1467# Digest
1468#
1469CONFIG_CRYPTO_CRC32C=m
1437CONFIG_CRYPTO_MD4=y 1470CONFIG_CRYPTO_MD4=y
1438CONFIG_CRYPTO_MD5=y 1471CONFIG_CRYPTO_MD5=y
1472CONFIG_CRYPTO_MICHAEL_MIC=m
1439CONFIG_CRYPTO_SHA1=y 1473CONFIG_CRYPTO_SHA1=y
1440CONFIG_CRYPTO_SHA256=m 1474CONFIG_CRYPTO_SHA256=m
1441CONFIG_CRYPTO_SHA512=m 1475CONFIG_CRYPTO_SHA512=m
1442CONFIG_CRYPTO_WP512=m
1443CONFIG_CRYPTO_TGR192=m 1476CONFIG_CRYPTO_TGR192=m
1444CONFIG_CRYPTO_GF128MUL=m 1477CONFIG_CRYPTO_WP512=m
1445CONFIG_CRYPTO_ECB=m 1478
1446CONFIG_CRYPTO_CBC=y 1479#
1447CONFIG_CRYPTO_PCBC=m 1480# Ciphers
1448CONFIG_CRYPTO_LRW=m 1481#
1449CONFIG_CRYPTO_XTS=m
1450# CONFIG_CRYPTO_CTR is not set
1451# CONFIG_CRYPTO_GCM is not set
1452# CONFIG_CRYPTO_CCM is not set
1453# CONFIG_CRYPTO_CRYPTD is not set
1454CONFIG_CRYPTO_DES=y
1455CONFIG_CRYPTO_FCRYPT=m
1456CONFIG_CRYPTO_BLOWFISH=m
1457CONFIG_CRYPTO_TWOFISH=m
1458CONFIG_CRYPTO_TWOFISH_COMMON=m
1459CONFIG_CRYPTO_SERPENT=m
1460CONFIG_CRYPTO_AES=m 1482CONFIG_CRYPTO_AES=m
1483CONFIG_CRYPTO_ANUBIS=m
1484CONFIG_CRYPTO_ARC4=m
1485CONFIG_CRYPTO_BLOWFISH=m
1486CONFIG_CRYPTO_CAMELLIA=m
1461CONFIG_CRYPTO_CAST5=m 1487CONFIG_CRYPTO_CAST5=m
1462CONFIG_CRYPTO_CAST6=m 1488CONFIG_CRYPTO_CAST6=m
1463CONFIG_CRYPTO_TEA=m 1489CONFIG_CRYPTO_DES=y
1464CONFIG_CRYPTO_ARC4=m 1490CONFIG_CRYPTO_FCRYPT=m
1465CONFIG_CRYPTO_KHAZAD=m 1491CONFIG_CRYPTO_KHAZAD=m
1466CONFIG_CRYPTO_ANUBIS=m
1467CONFIG_CRYPTO_SEED=m
1468# CONFIG_CRYPTO_SALSA20 is not set 1492# CONFIG_CRYPTO_SALSA20 is not set
1493CONFIG_CRYPTO_SEED=m
1494CONFIG_CRYPTO_SERPENT=m
1495CONFIG_CRYPTO_TEA=m
1496CONFIG_CRYPTO_TWOFISH=m
1497CONFIG_CRYPTO_TWOFISH_COMMON=m
1498
1499#
1500# Compression
1501#
1469CONFIG_CRYPTO_DEFLATE=y 1502CONFIG_CRYPTO_DEFLATE=y
1470CONFIG_CRYPTO_MICHAEL_MIC=m
1471CONFIG_CRYPTO_CRC32C=m
1472CONFIG_CRYPTO_CAMELLIA=m
1473CONFIG_CRYPTO_TEST=m
1474CONFIG_CRYPTO_AUTHENC=y
1475# CONFIG_CRYPTO_LZO is not set 1503# CONFIG_CRYPTO_LZO is not set
1476CONFIG_CRYPTO_HW=y 1504CONFIG_CRYPTO_HW=y
1477# CONFIG_CRYPTO_DEV_HIFN_795X is not set 1505# CONFIG_CRYPTO_DEV_HIFN_795X is not set
@@ -1492,3 +1520,4 @@ CONFIG_PLIST=y
1492CONFIG_HAS_IOMEM=y 1520CONFIG_HAS_IOMEM=y
1493CONFIG_HAS_IOPORT=y 1521CONFIG_HAS_IOPORT=y
1494CONFIG_HAS_DMA=y 1522CONFIG_HAS_DMA=y
1523CONFIG_HAVE_LMB=y
diff --git a/arch/sparc64/kernel/sysfs.c b/arch/sparc64/kernel/sysfs.c
index 52816c7be0b9..e885034a6b73 100644
--- a/arch/sparc64/kernel/sysfs.c
+++ b/arch/sparc64/kernel/sysfs.c
@@ -273,10 +273,22 @@ static void __init check_mmu_stats(void)
273 mmu_stats_supported = 1; 273 mmu_stats_supported = 1;
274} 274}
275 275
276static void register_nodes(void)
277{
278#ifdef CONFIG_NUMA
279 int i;
280
281 for (i = 0; i < MAX_NUMNODES; i++)
282 register_one_node(i);
283#endif
284}
285
276static int __init topology_init(void) 286static int __init topology_init(void)
277{ 287{
278 int cpu; 288 int cpu;
279 289
290 register_nodes();
291
280 check_mmu_stats(); 292 check_mmu_stats();
281 293
282 register_cpu_notifier(&sysfs_cpu_nb); 294 register_cpu_notifier(&sysfs_cpu_nb);
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 8e0e86787127..177d8aaeec42 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -25,6 +25,7 @@
25#include <linux/sort.h> 25#include <linux/sort.h>
26#include <linux/percpu.h> 26#include <linux/percpu.h>
27#include <linux/lmb.h> 27#include <linux/lmb.h>
28#include <linux/mmzone.h>
28 29
29#include <asm/head.h> 30#include <asm/head.h>
30#include <asm/system.h> 31#include <asm/system.h>
@@ -73,9 +74,7 @@ extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
73#define MAX_BANKS 32 74#define MAX_BANKS 32
74 75
75static struct linux_prom64_registers pavail[MAX_BANKS] __initdata; 76static struct linux_prom64_registers pavail[MAX_BANKS] __initdata;
76static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
77static int pavail_ents __initdata; 77static int pavail_ents __initdata;
78static int pavail_rescan_ents __initdata;
79 78
80static int cmp_p64(const void *a, const void *b) 79static int cmp_p64(const void *a, const void *b)
81{ 80{
@@ -716,19 +715,28 @@ out:
716 smp_new_mmu_context_version(); 715 smp_new_mmu_context_version();
717} 716}
718 717
719/* Find a free area for the bootmem map, avoiding the kernel image 718static int numa_enabled = 1;
720 * and the initial ramdisk. 719static int numa_debug;
721 */ 720
722static unsigned long __init choose_bootmap_pfn(unsigned long start_pfn, 721static int __init early_numa(char *p)
723 unsigned long end_pfn)
724{ 722{
725 unsigned long bootmap_size; 723 if (!p)
724 return 0;
725
726 if (strstr(p, "off"))
727 numa_enabled = 0;
726 728
727 bootmap_size = bootmem_bootmap_pages(end_pfn - start_pfn); 729 if (strstr(p, "debug"))
728 bootmap_size <<= PAGE_SHIFT; 730 numa_debug = 1;
729 731
730 return lmb_alloc(bootmap_size, PAGE_SIZE) >> PAGE_SHIFT; 732 return 0;
731} 733}
734early_param("numa", early_numa);
735
736#define numadbg(f, a...) \
737do { if (numa_debug) \
738 printk(KERN_INFO f, ## a); \
739} while (0)
732 740
733static void __init find_ramdisk(unsigned long phys_base) 741static void __init find_ramdisk(unsigned long phys_base)
734{ 742{
@@ -755,6 +763,9 @@ static void __init find_ramdisk(unsigned long phys_base)
755 ramdisk_image -= KERNBASE; 763 ramdisk_image -= KERNBASE;
756 ramdisk_image += phys_base; 764 ramdisk_image += phys_base;
757 765
766 numadbg("Found ramdisk at physical address 0x%lx, size %u\n",
767 ramdisk_image, sparc_ramdisk_size);
768
758 initrd_start = ramdisk_image; 769 initrd_start = ramdisk_image;
759 initrd_end = ramdisk_image + sparc_ramdisk_size; 770 initrd_end = ramdisk_image + sparc_ramdisk_size;
760 771
@@ -763,60 +774,625 @@ static void __init find_ramdisk(unsigned long phys_base)
763#endif 774#endif
764} 775}
765 776
766/* About pages_avail, this is the value we will use to calculate 777struct node_mem_mask {
767 * the zholes_size[] argument given to free_area_init_node(). The 778 unsigned long mask;
768 * page allocator uses this to calculate nr_kernel_pages, 779 unsigned long val;
769 * nr_all_pages and zone->present_pages. On NUMA it is used 780 unsigned long bootmem_paddr;
770 * to calculate zone->min_unmapped_pages and zone->min_slab_pages. 781};
771 * 782static struct node_mem_mask node_masks[MAX_NUMNODES];
772 * So this number should really be set to what the page allocator 783static int num_node_masks;
773 * actually ends up with. This means: 784
774 * 1) It should include bootmem map pages, we'll release those. 785int numa_cpu_lookup_table[NR_CPUS];
775 * 2) It should not include the kernel image, except for the 786cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
776 * __init sections which we will also release. 787
777 * 3) It should include the initrd image, since we'll release 788#ifdef CONFIG_NEED_MULTIPLE_NODES
778 * that too. 789static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
790
791struct mdesc_mblock {
792 u64 base;
793 u64 size;
794 u64 offset; /* RA-to-PA */
795};
796static struct mdesc_mblock *mblocks;
797static int num_mblocks;
798
799static unsigned long ra_to_pa(unsigned long addr)
800{
801 int i;
802
803 for (i = 0; i < num_mblocks; i++) {
804 struct mdesc_mblock *m = &mblocks[i];
805
806 if (addr >= m->base &&
807 addr < (m->base + m->size)) {
808 addr += m->offset;
809 break;
810 }
811 }
812 return addr;
813}
814
815static int find_node(unsigned long addr)
816{
817 int i;
818
819 addr = ra_to_pa(addr);
820 for (i = 0; i < num_node_masks; i++) {
821 struct node_mem_mask *p = &node_masks[i];
822
823 if ((addr & p->mask) == p->val)
824 return i;
825 }
826 return -1;
827}
828
829static unsigned long nid_range(unsigned long start, unsigned long end,
830 int *nid)
831{
832 *nid = find_node(start);
833 start += PAGE_SIZE;
834 while (start < end) {
835 int n = find_node(start);
836
837 if (n != *nid)
838 break;
839 start += PAGE_SIZE;
840 }
841
842 return start;
843}
844#else
845static unsigned long nid_range(unsigned long start, unsigned long end,
846 int *nid)
847{
848 *nid = 0;
849 return end;
850}
851#endif
852
853/* This must be invoked after performing all of the necessary
854 * add_active_range() calls for 'nid'. We need to be able to get
855 * correct data from get_pfn_range_for_nid().
779 */ 856 */
780static unsigned long __init bootmem_init(unsigned long *pages_avail, 857static void __init allocate_node_data(int nid)
781 unsigned long phys_base) 858{
859 unsigned long paddr, num_pages, start_pfn, end_pfn;
860 struct pglist_data *p;
861
862#ifdef CONFIG_NEED_MULTIPLE_NODES
863 paddr = lmb_alloc_nid(sizeof(struct pglist_data),
864 SMP_CACHE_BYTES, nid, nid_range);
865 if (!paddr) {
866 prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
867 prom_halt();
868 }
869 NODE_DATA(nid) = __va(paddr);
870 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
871
872 NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
873#endif
874
875 p = NODE_DATA(nid);
876
877 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
878 p->node_start_pfn = start_pfn;
879 p->node_spanned_pages = end_pfn - start_pfn;
880
881 if (p->node_spanned_pages) {
882 num_pages = bootmem_bootmap_pages(p->node_spanned_pages);
883
884 paddr = lmb_alloc_nid(num_pages << PAGE_SHIFT, PAGE_SIZE, nid,
885 nid_range);
886 if (!paddr) {
887 prom_printf("Cannot allocate bootmap for nid[%d]\n",
888 nid);
889 prom_halt();
890 }
891 node_masks[nid].bootmem_paddr = paddr;
892 }
893}
894
895static void init_node_masks_nonnuma(void)
782{ 896{
783 unsigned long end_pfn;
784 int i; 897 int i;
785 898
786 *pages_avail = lmb_phys_mem_size() >> PAGE_SHIFT; 899 numadbg("Initializing tables for non-numa.\n");
787 end_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
788 900
789 /* Initialize the boot-time allocator. */ 901 node_masks[0].mask = node_masks[0].val = 0;
790 max_pfn = max_low_pfn = end_pfn; 902 num_node_masks = 1;
791 min_low_pfn = (phys_base >> PAGE_SHIFT);
792 903
793 init_bootmem_node(NODE_DATA(0), 904 for (i = 0; i < NR_CPUS; i++)
794 choose_bootmap_pfn(min_low_pfn, end_pfn), 905 numa_cpu_lookup_table[i] = 0;
795 min_low_pfn, end_pfn);
796 906
797 /* Now register the available physical memory with the 907 numa_cpumask_lookup_table[0] = CPU_MASK_ALL;
798 * allocator. 908}
799 */ 909
800 for (i = 0; i < lmb.memory.cnt; i++) 910#ifdef CONFIG_NEED_MULTIPLE_NODES
801 free_bootmem(lmb.memory.region[i].base, 911struct pglist_data *node_data[MAX_NUMNODES];
802 lmb_size_bytes(&lmb.memory, i)); 912
913EXPORT_SYMBOL(numa_cpu_lookup_table);
914EXPORT_SYMBOL(numa_cpumask_lookup_table);
915EXPORT_SYMBOL(node_data);
916
917struct mdesc_mlgroup {
918 u64 node;
919 u64 latency;
920 u64 match;
921 u64 mask;
922};
923static struct mdesc_mlgroup *mlgroups;
924static int num_mlgroups;
925
926static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio,
927 u32 cfg_handle)
928{
929 u64 arc;
930
931 mdesc_for_each_arc(arc, md, pio, MDESC_ARC_TYPE_FWD) {
932 u64 target = mdesc_arc_target(md, arc);
933 const u64 *val;
934
935 val = mdesc_get_property(md, target,
936 "cfg-handle", NULL);
937 if (val && *val == cfg_handle)
938 return 0;
939 }
940 return -ENODEV;
941}
942
943static int scan_arcs_for_cfg_handle(struct mdesc_handle *md, u64 grp,
944 u32 cfg_handle)
945{
946 u64 arc, candidate, best_latency = ~(u64)0;
947
948 candidate = MDESC_NODE_NULL;
949 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
950 u64 target = mdesc_arc_target(md, arc);
951 const char *name = mdesc_node_name(md, target);
952 const u64 *val;
953
954 if (strcmp(name, "pio-latency-group"))
955 continue;
956
957 val = mdesc_get_property(md, target, "latency", NULL);
958 if (!val)
959 continue;
960
961 if (*val < best_latency) {
962 candidate = target;
963 best_latency = *val;
964 }
965 }
966
967 if (candidate == MDESC_NODE_NULL)
968 return -ENODEV;
969
970 return scan_pio_for_cfg_handle(md, candidate, cfg_handle);
971}
972
973int of_node_to_nid(struct device_node *dp)
974{
975 const struct linux_prom64_registers *regs;
976 struct mdesc_handle *md;
977 u32 cfg_handle;
978 int count, nid;
979 u64 grp;
980
981 if (!mlgroups)
982 return -1;
983
984 regs = of_get_property(dp, "reg", NULL);
985 if (!regs)
986 return -1;
987
988 cfg_handle = (regs->phys_addr >> 32UL) & 0x0fffffff;
989
990 md = mdesc_grab();
991
992 count = 0;
993 nid = -1;
994 mdesc_for_each_node_by_name(md, grp, "group") {
995 if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
996 nid = count;
997 break;
998 }
999 count++;
1000 }
1001
1002 mdesc_release(md);
1003
1004 return nid;
1005}
1006
1007static void add_node_ranges(void)
1008{
1009 int i;
1010
1011 for (i = 0; i < lmb.memory.cnt; i++) {
1012 unsigned long size = lmb_size_bytes(&lmb.memory, i);
1013 unsigned long start, end;
1014
1015 start = lmb.memory.region[i].base;
1016 end = start + size;
1017 while (start < end) {
1018 unsigned long this_end;
1019 int nid;
1020
1021 this_end = nid_range(start, end, &nid);
1022
1023 numadbg("Adding active range nid[%d] "
1024 "start[%lx] end[%lx]\n",
1025 nid, start, this_end);
1026
1027 add_active_range(nid,
1028 start >> PAGE_SHIFT,
1029 this_end >> PAGE_SHIFT);
1030
1031 start = this_end;
1032 }
1033 }
1034}
803 1035
804 for (i = 0; i < lmb.reserved.cnt; i++) 1036static int __init grab_mlgroups(struct mdesc_handle *md)
805 reserve_bootmem(lmb.reserved.region[i].base, 1037{
806 lmb_size_bytes(&lmb.reserved, i), 1038 unsigned long paddr;
807 BOOTMEM_DEFAULT); 1039 int count = 0;
1040 u64 node;
1041
1042 mdesc_for_each_node_by_name(md, node, "memory-latency-group")
1043 count++;
1044 if (!count)
1045 return -ENOENT;
1046
1047 paddr = lmb_alloc(count * sizeof(struct mdesc_mlgroup),
1048 SMP_CACHE_BYTES);
1049 if (!paddr)
1050 return -ENOMEM;
1051
1052 mlgroups = __va(paddr);
1053 num_mlgroups = count;
1054
1055 count = 0;
1056 mdesc_for_each_node_by_name(md, node, "memory-latency-group") {
1057 struct mdesc_mlgroup *m = &mlgroups[count++];
1058 const u64 *val;
1059
1060 m->node = node;
1061
1062 val = mdesc_get_property(md, node, "latency", NULL);
1063 m->latency = *val;
1064 val = mdesc_get_property(md, node, "address-match", NULL);
1065 m->match = *val;
1066 val = mdesc_get_property(md, node, "address-mask", NULL);
1067 m->mask = *val;
1068
1069 numadbg("MLGROUP[%d]: node[%lx] latency[%lx] "
1070 "match[%lx] mask[%lx]\n",
1071 count - 1, m->node, m->latency, m->match, m->mask);
1072 }
808 1073
809 *pages_avail -= PAGE_ALIGN(kern_size) >> PAGE_SHIFT; 1074 return 0;
1075}
810 1076
811 for (i = 0; i < lmb.memory.cnt; ++i) { 1077static int __init grab_mblocks(struct mdesc_handle *md)
812 unsigned long start_pfn, end_pfn, pages; 1078{
1079 unsigned long paddr;
1080 int count = 0;
1081 u64 node;
1082
1083 mdesc_for_each_node_by_name(md, node, "mblock")
1084 count++;
1085 if (!count)
1086 return -ENOENT;
1087
1088 paddr = lmb_alloc(count * sizeof(struct mdesc_mblock),
1089 SMP_CACHE_BYTES);
1090 if (!paddr)
1091 return -ENOMEM;
1092
1093 mblocks = __va(paddr);
1094 num_mblocks = count;
1095
1096 count = 0;
1097 mdesc_for_each_node_by_name(md, node, "mblock") {
1098 struct mdesc_mblock *m = &mblocks[count++];
1099 const u64 *val;
1100
1101 val = mdesc_get_property(md, node, "base", NULL);
1102 m->base = *val;
1103 val = mdesc_get_property(md, node, "size", NULL);
1104 m->size = *val;
1105 val = mdesc_get_property(md, node,
1106 "address-congruence-offset", NULL);
1107 m->offset = *val;
1108
1109 numadbg("MBLOCK[%d]: base[%lx] size[%lx] offset[%lx]\n",
1110 count - 1, m->base, m->size, m->offset);
1111 }
1112
1113 return 0;
1114}
1115
1116static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md,
1117 u64 grp, cpumask_t *mask)
1118{
1119 u64 arc;
1120
1121 cpus_clear(*mask);
1122
1123 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_BACK) {
1124 u64 target = mdesc_arc_target(md, arc);
1125 const char *name = mdesc_node_name(md, target);
1126 const u64 *id;
1127
1128 if (strcmp(name, "cpu"))
1129 continue;
1130 id = mdesc_get_property(md, target, "id", NULL);
1131 if (*id < NR_CPUS)
1132 cpu_set(*id, *mask);
1133 }
1134}
1135
1136static struct mdesc_mlgroup * __init find_mlgroup(u64 node)
1137{
1138 int i;
1139
1140 for (i = 0; i < num_mlgroups; i++) {
1141 struct mdesc_mlgroup *m = &mlgroups[i];
1142 if (m->node == node)
1143 return m;
1144 }
1145 return NULL;
1146}
1147
1148static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
1149 int index)
1150{
1151 struct mdesc_mlgroup *candidate = NULL;
1152 u64 arc, best_latency = ~(u64)0;
1153 struct node_mem_mask *n;
1154
1155 mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
1156 u64 target = mdesc_arc_target(md, arc);
1157 struct mdesc_mlgroup *m = find_mlgroup(target);
1158 if (!m)
1159 continue;
1160 if (m->latency < best_latency) {
1161 candidate = m;
1162 best_latency = m->latency;
1163 }
1164 }
1165 if (!candidate)
1166 return -ENOENT;
1167
1168 if (num_node_masks != index) {
1169 printk(KERN_ERR "Inconsistent NUMA state, "
1170 "index[%d] != num_node_masks[%d]\n",
1171 index, num_node_masks);
1172 return -EINVAL;
1173 }
1174
1175 n = &node_masks[num_node_masks++];
1176
1177 n->mask = candidate->mask;
1178 n->val = candidate->match;
1179
1180 numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%lx])\n",
1181 index, n->mask, n->val, candidate->latency);
1182
1183 return 0;
1184}
1185
1186static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp,
1187 int index)
1188{
1189 cpumask_t mask;
1190 int cpu;
1191
1192 numa_parse_mdesc_group_cpus(md, grp, &mask);
1193
1194 for_each_cpu_mask(cpu, mask)
1195 numa_cpu_lookup_table[cpu] = index;
1196 numa_cpumask_lookup_table[index] = mask;
1197
1198 if (numa_debug) {
1199 printk(KERN_INFO "NUMA GROUP[%d]: cpus [ ", index);
1200 for_each_cpu_mask(cpu, mask)
1201 printk("%d ", cpu);
1202 printk("]\n");
1203 }
1204
1205 return numa_attach_mlgroup(md, grp, index);
1206}
1207
1208static int __init numa_parse_mdesc(void)
1209{
1210 struct mdesc_handle *md = mdesc_grab();
1211 int i, err, count;
1212 u64 node;
1213
1214 node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
1215 if (node == MDESC_NODE_NULL) {
1216 mdesc_release(md);
1217 return -ENOENT;
1218 }
1219
1220 err = grab_mblocks(md);
1221 if (err < 0)
1222 goto out;
1223
1224 err = grab_mlgroups(md);
1225 if (err < 0)
1226 goto out;
1227
1228 count = 0;
1229 mdesc_for_each_node_by_name(md, node, "group") {
1230 err = numa_parse_mdesc_group(md, node, count);
1231 if (err < 0)
1232 break;
1233 count++;
1234 }
1235
1236 add_node_ranges();
1237
1238 for (i = 0; i < num_node_masks; i++) {
1239 allocate_node_data(i);
1240 node_set_online(i);
1241 }
1242
1243 err = 0;
1244out:
1245 mdesc_release(md);
1246 return err;
1247}
1248
1249static int __init numa_parse_sun4u(void)
1250{
1251 return -1;
1252}
1253
1254static int __init bootmem_init_numa(void)
1255{
1256 int err = -1;
1257
1258 numadbg("bootmem_init_numa()\n");
1259
1260 if (numa_enabled) {
1261 if (tlb_type == hypervisor)
1262 err = numa_parse_mdesc();
1263 else
1264 err = numa_parse_sun4u();
1265 }
1266 return err;
1267}
1268
1269#else
1270
1271static int bootmem_init_numa(void)
1272{
1273 return -1;
1274}
1275
1276#endif
1277
1278static void __init bootmem_init_nonnuma(void)
1279{
1280 unsigned long top_of_ram = lmb_end_of_DRAM();
1281 unsigned long total_ram = lmb_phys_mem_size();
1282 unsigned int i;
1283
1284 numadbg("bootmem_init_nonnuma()\n");
1285
1286 printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
1287 top_of_ram, total_ram);
1288 printk(KERN_INFO "Memory hole size: %ldMB\n",
1289 (top_of_ram - total_ram) >> 20);
1290
1291 init_node_masks_nonnuma();
1292
1293 for (i = 0; i < lmb.memory.cnt; i++) {
1294 unsigned long size = lmb_size_bytes(&lmb.memory, i);
1295 unsigned long start_pfn, end_pfn;
1296
1297 if (!size)
1298 continue;
813 1299
814 pages = lmb_size_pages(&lmb.memory, i);
815 start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT; 1300 start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
816 end_pfn = start_pfn + pages; 1301 end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
1302 add_active_range(0, start_pfn, end_pfn);
1303 }
817 1304
818 memory_present(0, start_pfn, end_pfn); 1305 allocate_node_data(0);
1306
1307 node_set_online(0);
1308}
1309
1310static void __init reserve_range_in_node(int nid, unsigned long start,
1311 unsigned long end)
1312{
1313 numadbg(" reserve_range_in_node(nid[%d],start[%lx],end[%lx]\n",
1314 nid, start, end);
1315 while (start < end) {
1316 unsigned long this_end;
1317 int n;
1318
1319 this_end = nid_range(start, end, &n);
1320 if (n == nid) {
1321 numadbg(" MATCH reserving range [%lx:%lx]\n",
1322 start, this_end);
1323 reserve_bootmem_node(NODE_DATA(nid), start,
1324 (this_end - start), BOOTMEM_DEFAULT);
1325 } else
1326 numadbg(" NO MATCH, advancing start to %lx\n",
1327 this_end);
1328
1329 start = this_end;
819 } 1330 }
1331}
1332
1333static void __init trim_reserved_in_node(int nid)
1334{
1335 int i;
1336
1337 numadbg(" trim_reserved_in_node(%d)\n", nid);
1338
1339 for (i = 0; i < lmb.reserved.cnt; i++) {
1340 unsigned long start = lmb.reserved.region[i].base;
1341 unsigned long size = lmb_size_bytes(&lmb.reserved, i);
1342 unsigned long end = start + size;
1343
1344 reserve_range_in_node(nid, start, end);
1345 }
1346}
1347
1348static void __init bootmem_init_one_node(int nid)
1349{
1350 struct pglist_data *p;
1351
1352 numadbg("bootmem_init_one_node(%d)\n", nid);
1353
1354 p = NODE_DATA(nid);
1355
1356 if (p->node_spanned_pages) {
1357 unsigned long paddr = node_masks[nid].bootmem_paddr;
1358 unsigned long end_pfn;
1359
1360 end_pfn = p->node_start_pfn + p->node_spanned_pages;
1361
1362 numadbg(" init_bootmem_node(%d, %lx, %lx, %lx)\n",
1363 nid, paddr >> PAGE_SHIFT, p->node_start_pfn, end_pfn);
1364
1365 init_bootmem_node(p, paddr >> PAGE_SHIFT,
1366 p->node_start_pfn, end_pfn);
1367
1368 numadbg(" free_bootmem_with_active_regions(%d, %lx)\n",
1369 nid, end_pfn);
1370 free_bootmem_with_active_regions(nid, end_pfn);
1371
1372 trim_reserved_in_node(nid);
1373
1374 numadbg(" sparse_memory_present_with_active_regions(%d)\n",
1375 nid);
1376 sparse_memory_present_with_active_regions(nid);
1377 }
1378}
1379
1380static unsigned long __init bootmem_init(unsigned long phys_base)
1381{
1382 unsigned long end_pfn;
1383 int nid;
1384
1385 end_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
1386 max_pfn = max_low_pfn = end_pfn;
1387 min_low_pfn = (phys_base >> PAGE_SHIFT);
1388
1389 if (bootmem_init_numa() < 0)
1390 bootmem_init_nonnuma();
1391
1392 /* XXX cpu notifier XXX */
1393
1394 for_each_online_node(nid)
1395 bootmem_init_one_node(nid);
820 1396
821 sparse_init(); 1397 sparse_init();
822 1398
@@ -1112,7 +1688,7 @@ void __init setup_per_cpu_areas(void)
1112 1688
1113void __init paging_init(void) 1689void __init paging_init(void)
1114{ 1690{
1115 unsigned long end_pfn, pages_avail, shift, phys_base; 1691 unsigned long end_pfn, shift, phys_base;
1116 unsigned long real_end, i; 1692 unsigned long real_end, i;
1117 1693
1118 /* These build time checkes make sure that the dcache_dirty_cpu() 1694 /* These build time checkes make sure that the dcache_dirty_cpu()
@@ -1220,27 +1796,21 @@ void __init paging_init(void)
1220 sun4v_mdesc_init(); 1796 sun4v_mdesc_init();
1221 1797
1222 /* Setup bootmem... */ 1798 /* Setup bootmem... */
1223 pages_avail = 0; 1799 last_valid_pfn = end_pfn = bootmem_init(phys_base);
1224 last_valid_pfn = end_pfn = bootmem_init(&pages_avail, phys_base);
1225 1800
1801#ifndef CONFIG_NEED_MULTIPLE_NODES
1226 max_mapnr = last_valid_pfn; 1802 max_mapnr = last_valid_pfn;
1227 1803#endif
1228 kernel_physical_mapping_init(); 1804 kernel_physical_mapping_init();
1229 1805
1230 { 1806 {
1231 unsigned long zones_size[MAX_NR_ZONES]; 1807 unsigned long max_zone_pfns[MAX_NR_ZONES];
1232 unsigned long zholes_size[MAX_NR_ZONES];
1233 int znum;
1234 1808
1235 for (znum = 0; znum < MAX_NR_ZONES; znum++) 1809 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
1236 zones_size[znum] = zholes_size[znum] = 0;
1237 1810
1238 zones_size[ZONE_NORMAL] = end_pfn; 1811 max_zone_pfns[ZONE_NORMAL] = end_pfn;
1239 zholes_size[ZONE_NORMAL] = end_pfn - pages_avail;
1240 1812
1241 free_area_init_node(0, &contig_page_data, zones_size, 1813 free_area_init_nodes(max_zone_pfns);
1242 __pa(PAGE_OFFSET) >> PAGE_SHIFT,
1243 zholes_size);
1244 } 1814 }
1245 1815
1246 printk("Booting Linux...\n"); 1816 printk("Booting Linux...\n");
@@ -1249,21 +1819,52 @@ void __init paging_init(void)
1249 cpu_probe(); 1819 cpu_probe();
1250} 1820}
1251 1821
1252static void __init taint_real_pages(void) 1822int __init page_in_phys_avail(unsigned long paddr)
1823{
1824 int i;
1825
1826 paddr &= PAGE_MASK;
1827
1828 for (i = 0; i < pavail_ents; i++) {
1829 unsigned long start, end;
1830
1831 start = pavail[i].phys_addr;
1832 end = start + pavail[i].reg_size;
1833
1834 if (paddr >= start && paddr < end)
1835 return 1;
1836 }
1837 if (paddr >= kern_base && paddr < (kern_base + kern_size))
1838 return 1;
1839#ifdef CONFIG_BLK_DEV_INITRD
1840 if (paddr >= __pa(initrd_start) &&
1841 paddr < __pa(PAGE_ALIGN(initrd_end)))
1842 return 1;
1843#endif
1844
1845 return 0;
1846}
1847
1848static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
1849static int pavail_rescan_ents __initdata;
1850
1851/* Certain OBP calls, such as fetching "available" properties, can
1852 * claim physical memory. So, along with initializing the valid
1853 * address bitmap, what we do here is refetch the physical available
1854 * memory list again, and make sure it provides at least as much
1855 * memory as 'pavail' does.
1856 */
1857static void setup_valid_addr_bitmap_from_pavail(void)
1253{ 1858{
1254 int i; 1859 int i;
1255 1860
1256 read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents); 1861 read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents);
1257 1862
1258 /* Find changes discovered in the physmem available rescan and
1259 * reserve the lost portions in the bootmem maps.
1260 */
1261 for (i = 0; i < pavail_ents; i++) { 1863 for (i = 0; i < pavail_ents; i++) {
1262 unsigned long old_start, old_end; 1864 unsigned long old_start, old_end;
1263 1865
1264 old_start = pavail[i].phys_addr; 1866 old_start = pavail[i].phys_addr;
1265 old_end = old_start + 1867 old_end = old_start + pavail[i].reg_size;
1266 pavail[i].reg_size;
1267 while (old_start < old_end) { 1868 while (old_start < old_end) {
1268 int n; 1869 int n;
1269 1870
@@ -1281,7 +1882,16 @@ static void __init taint_real_pages(void)
1281 goto do_next_page; 1882 goto do_next_page;
1282 } 1883 }
1283 } 1884 }
1284 reserve_bootmem(old_start, PAGE_SIZE, BOOTMEM_DEFAULT); 1885
1886 prom_printf("mem_init: Lost memory in pavail\n");
1887 prom_printf("mem_init: OLD start[%lx] size[%lx]\n",
1888 pavail[i].phys_addr,
1889 pavail[i].reg_size);
1890 prom_printf("mem_init: NEW start[%lx] size[%lx]\n",
1891 pavail_rescan[i].phys_addr,
1892 pavail_rescan[i].reg_size);
1893 prom_printf("mem_init: Cannot continue, aborting.\n");
1894 prom_halt();
1285 1895
1286 do_next_page: 1896 do_next_page:
1287 old_start += PAGE_SIZE; 1897 old_start += PAGE_SIZE;
@@ -1289,32 +1899,6 @@ static void __init taint_real_pages(void)
1289 } 1899 }
1290} 1900}
1291 1901
1292int __init page_in_phys_avail(unsigned long paddr)
1293{
1294 int i;
1295
1296 paddr &= PAGE_MASK;
1297
1298 for (i = 0; i < pavail_rescan_ents; i++) {
1299 unsigned long start, end;
1300
1301 start = pavail_rescan[i].phys_addr;
1302 end = start + pavail_rescan[i].reg_size;
1303
1304 if (paddr >= start && paddr < end)
1305 return 1;
1306 }
1307 if (paddr >= kern_base && paddr < (kern_base + kern_size))
1308 return 1;
1309#ifdef CONFIG_BLK_DEV_INITRD
1310 if (paddr >= __pa(initrd_start) &&
1311 paddr < __pa(PAGE_ALIGN(initrd_end)))
1312 return 1;
1313#endif
1314
1315 return 0;
1316}
1317
1318void __init mem_init(void) 1902void __init mem_init(void)
1319{ 1903{
1320 unsigned long codepages, datapages, initpages; 1904 unsigned long codepages, datapages, initpages;
@@ -1337,14 +1921,26 @@ void __init mem_init(void)
1337 addr += PAGE_SIZE; 1921 addr += PAGE_SIZE;
1338 } 1922 }
1339 1923
1340 taint_real_pages(); 1924 setup_valid_addr_bitmap_from_pavail();
1341 1925
1342 high_memory = __va(last_valid_pfn << PAGE_SHIFT); 1926 high_memory = __va(last_valid_pfn << PAGE_SHIFT);
1343 1927
1928#ifdef CONFIG_NEED_MULTIPLE_NODES
1929 for_each_online_node(i) {
1930 if (NODE_DATA(i)->node_spanned_pages != 0) {
1931 totalram_pages +=
1932 free_all_bootmem_node(NODE_DATA(i));
1933 }
1934 }
1935#else
1936 totalram_pages = free_all_bootmem();
1937#endif
1938
1344 /* We subtract one to account for the mem_map_zero page 1939 /* We subtract one to account for the mem_map_zero page
1345 * allocated below. 1940 * allocated below.
1346 */ 1941 */
1347 totalram_pages = num_physpages = free_all_bootmem() - 1; 1942 totalram_pages -= 1;
1943 num_physpages = totalram_pages;
1348 1944
1349 /* 1945 /*
1350 * Set up the zero page, mark it reserved, so that page count 1946 * Set up the zero page, mark it reserved, so that page count