Diffstat (limited to 'arch/sparc64/mm')
-rw-r--r--  arch/sparc64/mm/init.c  | 989
-rw-r--r--  arch/sparc64/mm/tsb.c   |   3
-rw-r--r--  arch/sparc64/mm/ultra.S |   4
3 files changed, 716 insertions(+), 280 deletions(-)
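The bulk of the change is in init.c: the hand-rolled bootmem placement code is replaced by the LMB (logical memory block) allocator, and NUMA support is added, driven on sun4v by the machine descriptor (MDESC). Node membership of a physical address is decided by per-node (mask, value) pairs taken from the MDESC "address-mask"/"address-match" properties. A minimal stand-alone sketch of that matching scheme follows; the mask and value constants are invented for illustration and do not come from real hardware:

#include <stdio.h>

/* Toy model of the node_masks[]/find_node() logic the patch adds to
 * init.c.  Two fake nodes are distinguished by a single address bit.
 */
struct node_mem_mask {
	unsigned long mask;
	unsigned long val;
};

static struct node_mem_mask node_masks[] = {
	{ .mask = 0x8000000000UL, .val = 0x0000000000UL },	/* node 0 */
	{ .mask = 0x8000000000UL, .val = 0x8000000000UL },	/* node 1 */
};
static int num_node_masks = 2;

static int find_node(unsigned long addr)
{
	int i;

	for (i = 0; i < num_node_masks; i++) {
		struct node_mem_mask *p = &node_masks[i];

		if ((addr & p->mask) == p->val)
			return i;
	}
	return -1;	/* no node claims this address */
}

int main(void)
{
	printf("0x0040000000 -> node %d\n", find_node(0x0040000000UL));
	printf("0x8040000000 -> node %d\n", find_node(0x8040000000UL));
	return 0;
}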
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index f37078d96407..177d8aaeec42 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -24,6 +24,8 @@
 #include <linux/cache.h>
 #include <linux/sort.h>
 #include <linux/percpu.h>
+#include <linux/lmb.h>
+#include <linux/mmzone.h>
 
 #include <asm/head.h>
 #include <asm/system.h>
@@ -72,9 +74,7 @@ extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
 #define MAX_BANKS	32
 
 static struct linux_prom64_registers pavail[MAX_BANKS] __initdata;
-static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
 static int pavail_ents __initdata;
-static int pavail_rescan_ents __initdata;
 
 static int cmp_p64(const void *a, const void *b)
 {
@@ -715,285 +715,684 @@ out:
 	smp_new_mmu_context_version();
 }
 
-/* Find a free area for the bootmem map, avoiding the kernel image
- * and the initial ramdisk.
- */
-static unsigned long __init choose_bootmap_pfn(unsigned long start_pfn,
-					       unsigned long end_pfn)
-{
-	unsigned long avoid_start, avoid_end, bootmap_size;
-	int i;
+static int numa_enabled = 1;
+static int numa_debug;
+
+static int __init early_numa(char *p)
+{
+	if (!p)
+		return 0;
+
+	if (strstr(p, "off"))
+		numa_enabled = 0;
+
+	if (strstr(p, "debug"))
+		numa_debug = 1;
+
+	return 0;
+}
+early_param("numa", early_numa);
 
-	bootmap_size = bootmem_bootmap_pages(end_pfn - start_pfn);
-	bootmap_size <<= PAGE_SHIFT;
+#define numadbg(f, a...) \
+do {	if (numa_debug) \
+		printk(KERN_INFO f, ## a); \
+} while (0)
 
-	avoid_start = avoid_end = 0;
+static void __init find_ramdisk(unsigned long phys_base)
+{
 #ifdef CONFIG_BLK_DEV_INITRD
-	avoid_start = initrd_start;
-	avoid_end = PAGE_ALIGN(initrd_end);
+	if (sparc_ramdisk_image || sparc_ramdisk_image64) {
+		unsigned long ramdisk_image;
+
+		/* Older versions of the bootloader only supported a
+		 * 32-bit physical address for the ramdisk image
+		 * location, stored at sparc_ramdisk_image.  Newer
+		 * SILO versions set sparc_ramdisk_image to zero and
+		 * provide a full 64-bit physical address at
+		 * sparc_ramdisk_image64.
+		 */
+		ramdisk_image = sparc_ramdisk_image;
+		if (!ramdisk_image)
+			ramdisk_image = sparc_ramdisk_image64;
+
+		/* Another bootloader quirk.  The bootloader normalizes
+		 * the physical address to KERNBASE, so we have to
+		 * factor that back out and add in the lowest valid
+		 * physical page address to get the true physical address.
+		 */
+		ramdisk_image -= KERNBASE;
+		ramdisk_image += phys_base;
+
+		numadbg("Found ramdisk at physical address 0x%lx, size %u\n",
+			ramdisk_image, sparc_ramdisk_size);
+
+		initrd_start = ramdisk_image;
+		initrd_end = ramdisk_image + sparc_ramdisk_size;
+
+		lmb_reserve(initrd_start, initrd_end);
+	}
 #endif
+}
 
-	for (i = 0; i < pavail_ents; i++) {
-		unsigned long start, end;
+struct node_mem_mask {
+	unsigned long mask;
+	unsigned long val;
+	unsigned long bootmem_paddr;
+};
+static struct node_mem_mask node_masks[MAX_NUMNODES];
+static int num_node_masks;
 
-		start = pavail[i].phys_addr;
-		end = start + pavail[i].reg_size;
+int numa_cpu_lookup_table[NR_CPUS];
+cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
 
-		while (start < end) {
-			if (start >= kern_base &&
-			    start < PAGE_ALIGN(kern_base + kern_size)) {
-				start = PAGE_ALIGN(kern_base + kern_size);
-				continue;
-			}
-			if (start >= avoid_start && start < avoid_end) {
-				start = avoid_end;
-				continue;
-			}
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
 
-			if ((end - start) < bootmap_size)
-				break;
+struct mdesc_mblock {
+	u64	base;
+	u64	size;
+	u64	offset; /* RA-to-PA */
+};
+static struct mdesc_mblock *mblocks;
+static int num_mblocks;
 
-			if (start < kern_base &&
-			    (start + bootmap_size) > kern_base) {
-				start = PAGE_ALIGN(kern_base + kern_size);
-				continue;
-			}
+static unsigned long ra_to_pa(unsigned long addr)
+{
+	int i;
 
-			if (start < avoid_start &&
-			    (start + bootmap_size) > avoid_start) {
-				start = avoid_end;
-				continue;
-			}
+	for (i = 0; i < num_mblocks; i++) {
+		struct mdesc_mblock *m = &mblocks[i];
 
-			/* OK, it doesn't overlap anything, use it.  */
-			return start >> PAGE_SHIFT;
+		if (addr >= m->base &&
+		    addr < (m->base + m->size)) {
+			addr += m->offset;
+			break;
 		}
 	}
-
-	prom_printf("Cannot find free area for bootmap, aborting.\n");
-	prom_halt();
+	return addr;
 }
 
-static void __init trim_pavail(unsigned long *cur_size_p,
-			       unsigned long *end_of_phys_p)
+static int find_node(unsigned long addr)
 {
-	unsigned long to_trim = *cur_size_p - cmdline_memory_size;
-	unsigned long avoid_start, avoid_end;
 	int i;
 
-	to_trim = PAGE_ALIGN(to_trim);
+	addr = ra_to_pa(addr);
+	for (i = 0; i < num_node_masks; i++) {
+		struct node_mem_mask *p = &node_masks[i];
 
-	avoid_start = avoid_end = 0;
-#ifdef CONFIG_BLK_DEV_INITRD
-	avoid_start = initrd_start;
-	avoid_end = PAGE_ALIGN(initrd_end);
+		if ((addr & p->mask) == p->val)
+			return i;
+	}
+	return -1;
+}
+
+static unsigned long nid_range(unsigned long start, unsigned long end,
+			       int *nid)
+{
+	*nid = find_node(start);
+	start += PAGE_SIZE;
+	while (start < end) {
+		int n = find_node(start);
+
+		if (n != *nid)
+			break;
+		start += PAGE_SIZE;
+	}
+
+	return start;
+}
+#else
+static unsigned long nid_range(unsigned long start, unsigned long end,
+			       int *nid)
+{
+	*nid = 0;
+	return end;
+}
 #endif
 
-	/* Trim some pavail[] entries in order to satisfy the
-	 * requested "mem=xxx" kernel command line specification.
-	 *
-	 * We must not trim off the kernel image area nor the
-	 * initial ramdisk range (if any).  Also, we must not trim
-	 * any pavail[] entry down to zero in order to preserve
-	 * the invariant that all pavail[] entries have a non-zero
-	 * size which is assumed by all of the code in here.
-	 */
-	for (i = 0; i < pavail_ents; i++) {
-		unsigned long start, end, kern_end;
-		unsigned long trim_low, trim_high, n;
+/* This must be invoked after performing all of the necessary
+ * add_active_range() calls for 'nid'.  We need to be able to get
+ * correct data from get_pfn_range_for_nid().
+ */
+static void __init allocate_node_data(int nid)
+{
+	unsigned long paddr, num_pages, start_pfn, end_pfn;
+	struct pglist_data *p;
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	paddr = lmb_alloc_nid(sizeof(struct pglist_data),
+			      SMP_CACHE_BYTES, nid, nid_range);
+	if (!paddr) {
+		prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
+		prom_halt();
+	}
+	NODE_DATA(nid) = __va(paddr);
+	memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
-		kern_end = PAGE_ALIGN(kern_base + kern_size);
+	NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+#endif
 
-		trim_low = start = pavail[i].phys_addr;
-		trim_high = end = start + pavail[i].reg_size;
+	p = NODE_DATA(nid);
 
-		if (kern_base >= start &&
-		    kern_base < end) {
-			trim_low = kern_base;
-			if (kern_end >= end)
-				continue;
-		}
-		if (kern_end >= start &&
-		    kern_end < end) {
-			trim_high = kern_end;
-		}
-		if (avoid_start &&
-		    avoid_start >= start &&
-		    avoid_start < end) {
-			if (trim_low > avoid_start)
-				trim_low = avoid_start;
-			if (avoid_end >= end)
-				continue;
-		}
-		if (avoid_end &&
-		    avoid_end >= start &&
-		    avoid_end < end) {
-			if (trim_high < avoid_end)
-				trim_high = avoid_end;
-		}
+	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+	p->node_start_pfn = start_pfn;
+	p->node_spanned_pages = end_pfn - start_pfn;
+
+	if (p->node_spanned_pages) {
+		num_pages = bootmem_bootmap_pages(p->node_spanned_pages);
+
+		paddr = lmb_alloc_nid(num_pages << PAGE_SHIFT, PAGE_SIZE, nid,
+				      nid_range);
+		if (!paddr) {
+			prom_printf("Cannot allocate bootmap for nid[%d]\n",
+				    nid);
+			prom_halt();
 		}
+		node_masks[nid].bootmem_paddr = paddr;
+	}
+}
+
+static void init_node_masks_nonnuma(void)
+{
+	int i;
+
+	numadbg("Initializing tables for non-numa.\n");
+
+	node_masks[0].mask = node_masks[0].val = 0;
+	num_node_masks = 1;
+
+	for (i = 0; i < NR_CPUS; i++)
+		numa_cpu_lookup_table[i] = 0;
+
+	numa_cpumask_lookup_table[0] = CPU_MASK_ALL;
+}
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data *node_data[MAX_NUMNODES];
+
+EXPORT_SYMBOL(numa_cpu_lookup_table);
+EXPORT_SYMBOL(numa_cpumask_lookup_table);
+EXPORT_SYMBOL(node_data);
+
+struct mdesc_mlgroup {
+	u64	node;
+	u64	latency;
+	u64	match;
+	u64	mask;
+};
+static struct mdesc_mlgroup *mlgroups;
+static int num_mlgroups;
+
+static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio,
+				   u32 cfg_handle)
+{
+	u64 arc;
 
-		if (trim_high <= trim_low)
+	mdesc_for_each_arc(arc, md, pio, MDESC_ARC_TYPE_FWD) {
+		u64 target = mdesc_arc_target(md, arc);
+		const u64 *val;
+
+		val = mdesc_get_property(md, target,
+					 "cfg-handle", NULL);
+		if (val && *val == cfg_handle)
+			return 0;
+	}
+	return -ENODEV;
+}
+
+static int scan_arcs_for_cfg_handle(struct mdesc_handle *md, u64 grp,
+				    u32 cfg_handle)
+{
+	u64 arc, candidate, best_latency = ~(u64)0;
+
+	candidate = MDESC_NODE_NULL;
+	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
+		u64 target = mdesc_arc_target(md, arc);
+		const char *name = mdesc_node_name(md, target);
+		const u64 *val;
+
+		if (strcmp(name, "pio-latency-group"))
 			continue;
 
-		if (trim_low == start && trim_high == end) {
-			/* Whole chunk is available for trimming.
-			 * Trim all except one page, in order to keep
-			 * entry non-empty.
-			 */
-			n = (end - start) - PAGE_SIZE;
-			if (n > to_trim)
-				n = to_trim;
-
-			if (n) {
-				pavail[i].phys_addr += n;
-				pavail[i].reg_size -= n;
-				to_trim -= n;
-			}
-		} else {
-			n = (trim_low - start);
-			if (n > to_trim)
-				n = to_trim;
-
-			if (n) {
-				pavail[i].phys_addr += n;
-				pavail[i].reg_size -= n;
-				to_trim -= n;
-			}
-			if (to_trim) {
-				n = end - trim_high;
-				if (n > to_trim)
-					n = to_trim;
-				if (n) {
-					pavail[i].reg_size -= n;
-					to_trim -= n;
-				}
-			}
+		val = mdesc_get_property(md, target, "latency", NULL);
+		if (!val)
+			continue;
+
+		if (*val < best_latency) {
+			candidate = target;
+			best_latency = *val;
 		}
+	}
+
+	if (candidate == MDESC_NODE_NULL)
+		return -ENODEV;
+
+	return scan_pio_for_cfg_handle(md, candidate, cfg_handle);
+}
+
+int of_node_to_nid(struct device_node *dp)
+{
+	const struct linux_prom64_registers *regs;
+	struct mdesc_handle *md;
+	u32 cfg_handle;
+	int count, nid;
+	u64 grp;
 
-		if (!to_trim)
+	if (!mlgroups)
+		return -1;
+
+	regs = of_get_property(dp, "reg", NULL);
+	if (!regs)
+		return -1;
+
+	cfg_handle = (regs->phys_addr >> 32UL) & 0x0fffffff;
+
+	md = mdesc_grab();
+
+	count = 0;
+	nid = -1;
+	mdesc_for_each_node_by_name(md, grp, "group") {
+		if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
+			nid = count;
 			break;
+		}
+		count++;
 	}
 
-	/* Recalculate.  */
-	*cur_size_p = 0UL;
-	for (i = 0; i < pavail_ents; i++) {
-		*end_of_phys_p = pavail[i].phys_addr +
-			pavail[i].reg_size;
-		*cur_size_p += pavail[i].reg_size;
-	}
+	mdesc_release(md);
+
+	return nid;
 }
 
-/* About pages_avail, this is the value we will use to calculate
- * the zholes_size[] argument given to free_area_init_node().  The
- * page allocator uses this to calculate nr_kernel_pages,
- * nr_all_pages and zone->present_pages.  On NUMA it is used
- * to calculate zone->min_unmapped_pages and zone->min_slab_pages.
- *
- * So this number should really be set to what the page allocator
- * actually ends up with.  This means:
- * 1) It should include bootmem map pages, we'll release those.
- * 2) It should not include the kernel image, except for the
- *    __init sections which we will also release.
- * 3) It should include the initrd image, since we'll release
- *    that too.
- */
-static unsigned long __init bootmem_init(unsigned long *pages_avail,
-					 unsigned long phys_base)
+static void add_node_ranges(void)
 {
-	unsigned long bootmap_size, end_pfn;
-	unsigned long end_of_phys_memory = 0UL;
-	unsigned long bootmap_pfn, bytes_avail, size;
 	int i;
 
-	bytes_avail = 0UL;
-	for (i = 0; i < pavail_ents; i++) {
-		end_of_phys_memory = pavail[i].phys_addr +
-			pavail[i].reg_size;
-		bytes_avail += pavail[i].reg_size;
+	for (i = 0; i < lmb.memory.cnt; i++) {
+		unsigned long size = lmb_size_bytes(&lmb.memory, i);
+		unsigned long start, end;
+
+		start = lmb.memory.region[i].base;
+		end = start + size;
+		while (start < end) {
+			unsigned long this_end;
+			int nid;
+
+			this_end = nid_range(start, end, &nid);
+
+			numadbg("Adding active range nid[%d] "
+				"start[%lx] end[%lx]\n",
+				nid, start, this_end);
+
+			add_active_range(nid,
+					 start >> PAGE_SHIFT,
+					 this_end >> PAGE_SHIFT);
+
+			start = this_end;
+		}
 	}
+}
 
-	/* Determine the location of the initial ramdisk before trying
-	 * to honor the "mem=xxx" command line argument.  We must know
-	 * where the kernel image and the ramdisk image are so that we
-	 * do not trim those two areas from the physical memory map.
-	 */
+static int __init grab_mlgroups(struct mdesc_handle *md)
+{
+	unsigned long paddr;
+	int count = 0;
+	u64 node;
+
+	mdesc_for_each_node_by_name(md, node, "memory-latency-group")
+		count++;
+	if (!count)
+		return -ENOENT;
+
+	paddr = lmb_alloc(count * sizeof(struct mdesc_mlgroup),
+			  SMP_CACHE_BYTES);
+	if (!paddr)
+		return -ENOMEM;
+
+	mlgroups = __va(paddr);
+	num_mlgroups = count;
+
+	count = 0;
+	mdesc_for_each_node_by_name(md, node, "memory-latency-group") {
+		struct mdesc_mlgroup *m = &mlgroups[count++];
+		const u64 *val;
+
+		m->node = node;
+
+		val = mdesc_get_property(md, node, "latency", NULL);
+		m->latency = *val;
+		val = mdesc_get_property(md, node, "address-match", NULL);
+		m->match = *val;
+		val = mdesc_get_property(md, node, "address-mask", NULL);
+		m->mask = *val;
+
+		numadbg("MLGROUP[%d]: node[%lx] latency[%lx] "
+			"match[%lx] mask[%lx]\n",
+			count - 1, m->node, m->latency, m->match, m->mask);
+	}
 
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Now have to check initial ramdisk, so that bootmap does not overwrite it */
-	if (sparc_ramdisk_image || sparc_ramdisk_image64) {
-		unsigned long ramdisk_image = sparc_ramdisk_image ?
-			sparc_ramdisk_image : sparc_ramdisk_image64;
-		ramdisk_image -= KERNBASE;
-		initrd_start = ramdisk_image + phys_base;
-		initrd_end = initrd_start + sparc_ramdisk_size;
-		if (initrd_end > end_of_phys_memory) {
-			printk(KERN_CRIT "initrd extends beyond end of memory "
-			       "(0x%016lx > 0x%016lx)\ndisabling initrd\n",
-			       initrd_end, end_of_phys_memory);
-			initrd_start = 0;
-			initrd_end = 0;
+	return 0;
+}
+
+static int __init grab_mblocks(struct mdesc_handle *md)
+{
+	unsigned long paddr;
+	int count = 0;
+	u64 node;
+
+	mdesc_for_each_node_by_name(md, node, "mblock")
+		count++;
+	if (!count)
+		return -ENOENT;
+
+	paddr = lmb_alloc(count * sizeof(struct mdesc_mblock),
+			  SMP_CACHE_BYTES);
+	if (!paddr)
+		return -ENOMEM;
+
+	mblocks = __va(paddr);
+	num_mblocks = count;
+
+	count = 0;
+	mdesc_for_each_node_by_name(md, node, "mblock") {
+		struct mdesc_mblock *m = &mblocks[count++];
+		const u64 *val;
+
+		val = mdesc_get_property(md, node, "base", NULL);
+		m->base = *val;
+		val = mdesc_get_property(md, node, "size", NULL);
+		m->size = *val;
+		val = mdesc_get_property(md, node,
+					 "address-congruence-offset", NULL);
+		m->offset = *val;
+
+		numadbg("MBLOCK[%d]: base[%lx] size[%lx] offset[%lx]\n",
+			count - 1, m->base, m->size, m->offset);
+	}
+
+	return 0;
+}
+
+static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md,
+					       u64 grp, cpumask_t *mask)
+{
+	u64 arc;
+
+	cpus_clear(*mask);
+
+	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_BACK) {
+		u64 target = mdesc_arc_target(md, arc);
+		const char *name = mdesc_node_name(md, target);
+		const u64 *id;
+
+		if (strcmp(name, "cpu"))
+			continue;
+		id = mdesc_get_property(md, target, "id", NULL);
+		if (*id < NR_CPUS)
+			cpu_set(*id, *mask);
+	}
+}
+
+static struct mdesc_mlgroup * __init find_mlgroup(u64 node)
+{
+	int i;
+
+	for (i = 0; i < num_mlgroups; i++) {
+		struct mdesc_mlgroup *m = &mlgroups[i];
+		if (m->node == node)
+			return m;
+	}
+	return NULL;
+}
+
+static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
+				      int index)
+{
+	struct mdesc_mlgroup *candidate = NULL;
+	u64 arc, best_latency = ~(u64)0;
+	struct node_mem_mask *n;
+
+	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
+		u64 target = mdesc_arc_target(md, arc);
+		struct mdesc_mlgroup *m = find_mlgroup(target);
+		if (!m)
+			continue;
+		if (m->latency < best_latency) {
+			candidate = m;
+			best_latency = m->latency;
 		}
 	}
-#endif
+	if (!candidate)
+		return -ENOENT;
+
+	if (num_node_masks != index) {
+		printk(KERN_ERR "Inconsistent NUMA state, "
+		       "index[%d] != num_node_masks[%d]\n",
+		       index, num_node_masks);
+		return -EINVAL;
+	}
 
-	if (cmdline_memory_size &&
-	    bytes_avail > cmdline_memory_size)
-		trim_pavail(&bytes_avail,
-			    &end_of_phys_memory);
+	n = &node_masks[num_node_masks++];
 
-	*pages_avail = bytes_avail >> PAGE_SHIFT;
+	n->mask = candidate->mask;
+	n->val = candidate->match;
 
-	end_pfn = end_of_phys_memory >> PAGE_SHIFT;
+	numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%lx])\n",
+		index, n->mask, n->val, candidate->latency);
 
-	/* Initialize the boot-time allocator. */
-	max_pfn = max_low_pfn = end_pfn;
-	min_low_pfn = (phys_base >> PAGE_SHIFT);
+	return 0;
+}
+
+static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp,
+					  int index)
+{
+	cpumask_t mask;
+	int cpu;
 
-	bootmap_pfn = choose_bootmap_pfn(min_low_pfn, end_pfn);
+	numa_parse_mdesc_group_cpus(md, grp, &mask);
 
-	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap_pfn,
-					 min_low_pfn, end_pfn);
+	for_each_cpu_mask(cpu, mask)
+		numa_cpu_lookup_table[cpu] = index;
+	numa_cpumask_lookup_table[index] = mask;
 
-	/* Now register the available physical memory with the
-	 * allocator.
-	 */
-	for (i = 0; i < pavail_ents; i++)
-		free_bootmem(pavail[i].phys_addr, pavail[i].reg_size);
+	if (numa_debug) {
+		printk(KERN_INFO "NUMA GROUP[%d]: cpus [ ", index);
+		for_each_cpu_mask(cpu, mask)
+			printk("%d ", cpu);
+		printk("]\n");
+	}
 
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (initrd_start) {
-		size = initrd_end - initrd_start;
+	return numa_attach_mlgroup(md, grp, index);
+}
+
+static int __init numa_parse_mdesc(void)
+{
+	struct mdesc_handle *md = mdesc_grab();
+	int i, err, count;
+	u64 node;
+
+	node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
+	if (node == MDESC_NODE_NULL) {
+		mdesc_release(md);
+		return -ENOENT;
+	}
+
+	err = grab_mblocks(md);
+	if (err < 0)
+		goto out;
+
+	err = grab_mlgroups(md);
+	if (err < 0)
+		goto out;
+
+	count = 0;
+	mdesc_for_each_node_by_name(md, node, "group") {
+		err = numa_parse_mdesc_group(md, node, count);
+		if (err < 0)
+			break;
+		count++;
+	}
+
+	add_node_ranges();
+
+	for (i = 0; i < num_node_masks; i++) {
+		allocate_node_data(i);
+		node_set_online(i);
+	}
+
+	err = 0;
+out:
+	mdesc_release(md);
+	return err;
+}
+
+static int __init numa_parse_sun4u(void)
+{
+	return -1;
+}
 
-		/* Reserve the initrd image area. */
-		reserve_bootmem(initrd_start, size, BOOTMEM_DEFAULT);
+static int __init bootmem_init_numa(void)
+{
+	int err = -1;
 
-		initrd_start += PAGE_OFFSET;
-		initrd_end += PAGE_OFFSET;
+	numadbg("bootmem_init_numa()\n");
+
+	if (numa_enabled) {
+		if (tlb_type == hypervisor)
+			err = numa_parse_mdesc();
+		else
+			err = numa_parse_sun4u();
 	}
+	return err;
+}
+
+#else
+
+static int bootmem_init_numa(void)
+{
+	return -1;
+}
+
 #endif
-	/* Reserve the kernel text/data/bss. */
-	reserve_bootmem(kern_base, kern_size, BOOTMEM_DEFAULT);
-	*pages_avail -= PAGE_ALIGN(kern_size) >> PAGE_SHIFT;
-
-	/* Add back in the initmem pages. */
-	size = ((unsigned long)(__init_end) & PAGE_MASK) -
-		PAGE_ALIGN((unsigned long)__init_begin);
-	*pages_avail += size >> PAGE_SHIFT;
-
-	/* Reserve the bootmem map.   We do not account for it
-	 * in pages_avail because we will release that memory
-	 * in free_all_bootmem.
-	 */
-	size = bootmap_size;
-	reserve_bootmem((bootmap_pfn << PAGE_SHIFT), size, BOOTMEM_DEFAULT);
 
-	for (i = 0; i < pavail_ents; i++) {
+static void __init bootmem_init_nonnuma(void)
+{
+	unsigned long top_of_ram = lmb_end_of_DRAM();
+	unsigned long total_ram = lmb_phys_mem_size();
+	unsigned int i;
+
+	numadbg("bootmem_init_nonnuma()\n");
+
+	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
+	       top_of_ram, total_ram);
+	printk(KERN_INFO "Memory hole size: %ldMB\n",
+	       (top_of_ram - total_ram) >> 20);
+
+	init_node_masks_nonnuma();
+
+	for (i = 0; i < lmb.memory.cnt; i++) {
+		unsigned long size = lmb_size_bytes(&lmb.memory, i);
 		unsigned long start_pfn, end_pfn;
 
-		start_pfn = pavail[i].phys_addr >> PAGE_SHIFT;
-		end_pfn = (start_pfn + (pavail[i].reg_size >> PAGE_SHIFT));
-		memory_present(0, start_pfn, end_pfn);
+		if (!size)
+			continue;
+
+		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
+		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
+		add_active_range(0, start_pfn, end_pfn);
+	}
+
+	allocate_node_data(0);
+
+	node_set_online(0);
+}
+
+static void __init reserve_range_in_node(int nid, unsigned long start,
+					 unsigned long end)
+{
+	numadbg("    reserve_range_in_node(nid[%d],start[%lx],end[%lx]\n",
+		nid, start, end);
+	while (start < end) {
+		unsigned long this_end;
+		int n;
+
+		this_end = nid_range(start, end, &n);
+		if (n == nid) {
+			numadbg("      MATCH reserving range [%lx:%lx]\n",
+				start, this_end);
+			reserve_bootmem_node(NODE_DATA(nid), start,
+					     (this_end - start), BOOTMEM_DEFAULT);
+		} else
+			numadbg("      NO MATCH, advancing start to %lx\n",
+				this_end);
+
+		start = this_end;
+	}
+}
+
+static void __init trim_reserved_in_node(int nid)
+{
+	int i;
+
+	numadbg("  trim_reserved_in_node(%d)\n", nid);
+
+	for (i = 0; i < lmb.reserved.cnt; i++) {
+		unsigned long start = lmb.reserved.region[i].base;
+		unsigned long size = lmb_size_bytes(&lmb.reserved, i);
+		unsigned long end = start + size;
+
+		reserve_range_in_node(nid, start, end);
+	}
+}
+
+static void __init bootmem_init_one_node(int nid)
+{
+	struct pglist_data *p;
+
+	numadbg("bootmem_init_one_node(%d)\n", nid);
+
+	p = NODE_DATA(nid);
+
+	if (p->node_spanned_pages) {
+		unsigned long paddr = node_masks[nid].bootmem_paddr;
+		unsigned long end_pfn;
+
+		end_pfn = p->node_start_pfn + p->node_spanned_pages;
+
+		numadbg("  init_bootmem_node(%d, %lx, %lx, %lx)\n",
+			nid, paddr >> PAGE_SHIFT, p->node_start_pfn, end_pfn);
+
+		init_bootmem_node(p, paddr >> PAGE_SHIFT,
+				  p->node_start_pfn, end_pfn);
+
+		numadbg("  free_bootmem_with_active_regions(%d, %lx)\n",
+			nid, end_pfn);
+		free_bootmem_with_active_regions(nid, end_pfn);
+
+		trim_reserved_in_node(nid);
+
+		numadbg("  sparse_memory_present_with_active_regions(%d)\n",
+			nid);
+		sparse_memory_present_with_active_regions(nid);
 	}
+}
+
+static unsigned long __init bootmem_init(unsigned long phys_base)
+{
+	unsigned long end_pfn;
+	int nid;
+
+	end_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_pfn = max_low_pfn = end_pfn;
+	min_low_pfn = (phys_base >> PAGE_SHIFT);
+
+	if (bootmem_init_numa() < 0)
+		bootmem_init_nonnuma();
+
+	/* XXX cpu notifier XXX */
+
+	for_each_online_node(nid)
+		bootmem_init_one_node(nid);
 
 	sparse_init();
 
@@ -1289,7 +1688,7 @@ void __init setup_per_cpu_areas(void)
 
 void __init paging_init(void)
 {
-	unsigned long end_pfn, pages_avail, shift, phys_base;
+	unsigned long end_pfn, shift, phys_base;
 	unsigned long real_end, i;
 
 	/* These build time checkes make sure that the dcache_dirty_cpu()
@@ -1330,12 +1729,26 @@ void __init paging_init(void)
 		sun4v_ktsb_init();
 	}
 
+	lmb_init();
+
 	/* Find available physical memory... */
 	read_obp_memory("available", &pavail[0], &pavail_ents);
 
 	phys_base = 0xffffffffffffffffUL;
-	for (i = 0; i < pavail_ents; i++)
+	for (i = 0; i < pavail_ents; i++) {
 		phys_base = min(phys_base, pavail[i].phys_addr);
+		lmb_add(pavail[i].phys_addr, pavail[i].reg_size);
+	}
+
+	lmb_reserve(kern_base, kern_size);
+
+	find_ramdisk(phys_base);
+
+	if (cmdline_memory_size)
+		lmb_enforce_memory_limit(phys_base + cmdline_memory_size);
+
+	lmb_analyze();
+	lmb_dump_all();
 
 	set_bit(0, mmu_context_bmap);
 
@@ -1371,14 +1784,10 @@ void __init paging_init(void)
 	if (tlb_type == hypervisor)
 		sun4v_ktsb_register();
 
-	/* Setup bootmem... */
-	pages_avail = 0;
-	last_valid_pfn = end_pfn = bootmem_init(&pages_avail, phys_base);
-
-	max_mapnr = last_valid_pfn;
-
-	kernel_physical_mapping_init();
-
+	/* We must setup the per-cpu areas before we pull in the
+	 * PROM and the MDESC.  The code there fills in cpu and
+	 * other information into per-cpu data structures.
+	 */
 	real_setup_per_cpu_areas();
 
 	prom_build_devicetree();
@@ -1386,20 +1795,22 @@ void __init paging_init(void)
 	if (tlb_type == hypervisor)
 		sun4v_mdesc_init();
 
+	/* Setup bootmem... */
+	last_valid_pfn = end_pfn = bootmem_init(phys_base);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+	max_mapnr = last_valid_pfn;
+#endif
+	kernel_physical_mapping_init();
+
 	{
-		unsigned long zones_size[MAX_NR_ZONES];
-		unsigned long zholes_size[MAX_NR_ZONES];
-		int znum;
+		unsigned long max_zone_pfns[MAX_NR_ZONES];
 
-		for (znum = 0; znum < MAX_NR_ZONES; znum++)
-			zones_size[znum] = zholes_size[znum] = 0;
+		memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 
-		zones_size[ZONE_NORMAL] = end_pfn;
-		zholes_size[ZONE_NORMAL] = end_pfn - pages_avail;
+		max_zone_pfns[ZONE_NORMAL] = end_pfn;
 
-		free_area_init_node(0, &contig_page_data, zones_size,
-				    __pa(PAGE_OFFSET) >> PAGE_SHIFT,
-				    zholes_size);
+		free_area_init_nodes(max_zone_pfns);
 	}
 
 	printk("Booting Linux...\n");
@@ -1408,21 +1819,52 @@ void __init paging_init(void)
 	cpu_probe();
 }
 
-static void __init taint_real_pages(void)
+int __init page_in_phys_avail(unsigned long paddr)
+{
+	int i;
+
+	paddr &= PAGE_MASK;
+
+	for (i = 0; i < pavail_ents; i++) {
+		unsigned long start, end;
+
+		start = pavail[i].phys_addr;
+		end = start + pavail[i].reg_size;
+
+		if (paddr >= start && paddr < end)
+			return 1;
+	}
+	if (paddr >= kern_base && paddr < (kern_base + kern_size))
+		return 1;
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (paddr >= __pa(initrd_start) &&
+	    paddr < __pa(PAGE_ALIGN(initrd_end)))
+		return 1;
+#endif
+
+	return 0;
+}
+
+static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
+static int pavail_rescan_ents __initdata;
+
+/* Certain OBP calls, such as fetching "available" properties, can
+ * claim physical memory.  So, along with initializing the valid
+ * address bitmap, what we do here is refetch the physical available
+ * memory list again, and make sure it provides at least as much
+ * memory as 'pavail' does.
+ */
+static void setup_valid_addr_bitmap_from_pavail(void)
 {
 	int i;
 
 	read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents);
 
-	/* Find changes discovered in the physmem available rescan and
-	 * reserve the lost portions in the bootmem maps.
-	 */
 	for (i = 0; i < pavail_ents; i++) {
 		unsigned long old_start, old_end;
 
 		old_start = pavail[i].phys_addr;
-		old_end = old_start +
-			pavail[i].reg_size;
+		old_end = old_start + pavail[i].reg_size;
 		while (old_start < old_end) {
 			int n;
 
@@ -1440,7 +1882,16 @@ static void __init taint_real_pages(void)
 				goto do_next_page;
 			}
 		}
-		reserve_bootmem(old_start, PAGE_SIZE, BOOTMEM_DEFAULT);
+
+		prom_printf("mem_init: Lost memory in pavail\n");
+		prom_printf("mem_init: OLD start[%lx] size[%lx]\n",
+			    pavail[i].phys_addr,
+			    pavail[i].reg_size);
+		prom_printf("mem_init: NEW start[%lx] size[%lx]\n",
+			    pavail_rescan[i].phys_addr,
+			    pavail_rescan[i].reg_size);
+		prom_printf("mem_init: Cannot continue, aborting.\n");
+		prom_halt();
 
 	do_next_page:
 		old_start += PAGE_SIZE;
@@ -1448,32 +1899,6 @@ static void __init taint_real_pages(void)
 	}
 }
 
-int __init page_in_phys_avail(unsigned long paddr)
-{
-	int i;
-
-	paddr &= PAGE_MASK;
-
-	for (i = 0; i < pavail_rescan_ents; i++) {
-		unsigned long start, end;
-
-		start = pavail_rescan[i].phys_addr;
-		end = start + pavail_rescan[i].reg_size;
-
-		if (paddr >= start && paddr < end)
-			return 1;
-	}
-	if (paddr >= kern_base && paddr < (kern_base + kern_size))
-		return 1;
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (paddr >= __pa(initrd_start) &&
-	    paddr < __pa(PAGE_ALIGN(initrd_end)))
-		return 1;
-#endif
-
-	return 0;
-}
-
 void __init mem_init(void)
 {
 	unsigned long codepages, datapages, initpages;
@@ -1496,14 +1921,26 @@ void __init mem_init(void)
 		addr += PAGE_SIZE;
 	}
 
-	taint_real_pages();
+	setup_valid_addr_bitmap_from_pavail();
 
 	high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	for_each_online_node(i) {
+		if (NODE_DATA(i)->node_spanned_pages != 0) {
+			totalram_pages +=
+				free_all_bootmem_node(NODE_DATA(i));
+		}
+	}
+#else
+	totalram_pages = free_all_bootmem();
+#endif
+
 	/* We subtract one to account for the mem_map_zero page
 	 * allocated below.
 	 */
-	totalram_pages = num_physpages = free_all_bootmem() - 1;
+	totalram_pages -= 1;
+	num_physpages = totalram_pages;
 
 	/*
 	 * Set up the zero page, mark it reserved, so that page count
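The add_node_ranges()/nid_range() pair above is the heart of the per-node memory registration: each LMB span is walked page by page and split wherever the owning node changes, and every resulting sub-range is handed to add_active_range(). A rough user-space model of that walk (the page size matches sparc64, but the node layout is made up for the example):

#include <stdio.h>

#define PAGE_SIZE	8192UL	/* sparc64 base page size */

/* Fake layout for the example: node 0 below 1GB, node 1 above it. */
static int find_node(unsigned long addr)
{
	return addr < 0x40000000UL ? 0 : 1;
}

/* Mirrors nid_range(): advance page by page until the node id
 * changes, reporting where the next range must start. */
static unsigned long nid_range(unsigned long start, unsigned long end,
			       int *nid)
{
	*nid = find_node(start);
	start += PAGE_SIZE;
	while (start < end) {
		int n = find_node(start);

		if (n != *nid)
			break;
		start += PAGE_SIZE;
	}
	return start;
}

int main(void)
{
	/* One physical span straddling the fake node boundary. */
	unsigned long start = 0x3fffc000UL, end = 0x40008000UL;

	while (start < end) {
		int nid;
		unsigned long this_end = nid_range(start, end, &nid);

		printf("Adding active range nid[%d] start[%lx] end[%lx]\n",
		       nid, start, this_end);
		start = this_end;
	}
	return 0;
}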
diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c
index a3e6e4b635b3..fe70c8a557b5 100644
--- a/arch/sparc64/mm/tsb.c
+++ b/arch/sparc64/mm/tsb.c
@@ -321,7 +321,8 @@ retry_tsb_alloc:
 	if (new_size > (PAGE_SIZE * 2))
 		gfp_flags = __GFP_NOWARN | __GFP_NORETRY;
 
-	new_tsb = kmem_cache_alloc(tsb_caches[new_cache_index], gfp_flags);
+	new_tsb = kmem_cache_alloc_node(tsb_caches[new_cache_index],
+					gfp_flags, numa_node_id());
 	if (unlikely(!new_tsb)) {
 		/* Not being able to fork due to a high-order TSB
 		 * allocation failure is very bad behavior.  Just back
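The tsb.c hunk above swaps kmem_cache_alloc() for kmem_cache_alloc_node(), which takes an explicit NUMA node; numa_node_id() names the node of the executing CPU, so a grown TSB lands in node-local memory when possible. A sketch of the pattern (kernel context assumed, not compilable stand-alone; the wrapper name is invented for illustration):

/* Prefer memory attached to the current CPU's node.  If node-local
 * memory is unavailable, the slab allocator falls back to other
 * nodes rather than failing outright. */
static void *alloc_node_local(struct kmem_cache *cache, gfp_t gfp_flags)
{
	return kmem_cache_alloc_node(cache, gfp_flags, numa_node_id());
}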
diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S
index 2865c105b6a4..e686a67561af 100644
--- a/arch/sparc64/mm/ultra.S
+++ b/arch/sparc64/mm/ultra.S
@@ -476,7 +476,6 @@ xcall_sync_tick:
 #endif
 	call		smp_synchronize_tick_client
 	 nop
-	clr		%l6
 	b		rtrap_xcall
 	 ldx		[%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
 
@@ -511,7 +510,6 @@ xcall_report_regs:
 #endif
 	call		__show_regs
 	 add		%sp, PTREGS_OFF, %o0
-	clr		%l6
 	/* Has to be a non-v9 branch due to the large distance. */
 	b		rtrap_xcall
 	 ldx		[%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
@@ -576,7 +574,7 @@ __hypervisor_tlb_xcall_error:
 	mov	%l4, %o0
 	call	hypervisor_tlbop_error_xcall
 	 mov	%l5, %o1
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 	.globl		__hypervisor_xcall_flush_tlb_mm
 __hypervisor_xcall_flush_tlb_mm: /* 21 insns */