author     Dave Hansen <dave@linux.vnet.ibm.com>    2008-11-24 07:02:35 -0500
committer  Paul Mackerras <paulus@samba.org>        2008-11-30 17:40:18 -0500
commit     4a6186696e7f15b3ea4dafcdb64ee0703e0e4487
tree       7063a21064917efbc31f6115336cdc661b658187
parent     4b824de9b18b8d1013e9fc9e4b0f855ced8cac2c
powerpc: Fix boot freeze on machine with empty memory node
I got a bug report about a distro kernel not booting on a particular
machine.  It would freeze during boot:

> ...
> Could not find start_pfn for node 1
> [boot]0015 Setup Done
> Built 2 zonelists in Node order, mobility grouping on.  Total pages: 123783
> Policy zone: DMA
> Kernel command line:
> [boot]0020 XICS Init
> [boot]0021 XICS Done
> PID hash table entries: 4096 (order: 12, 32768 bytes)
> clocksource: timebase mult[7d0000] shift[22] registered
> Console: colour dummy device 80x25
> console handover: boot [udbg0] -> real [hvc0]
> Dentry cache hash table entries: 1048576 (order: 7, 8388608 bytes)
> Inode-cache hash table entries: 524288 (order: 6, 4194304 bytes)
> freeing bootmem node 0

I've reproduced this on 2.6.27.7.  It is caused by commit
8f64e1f2d1e09267ac926e15090fd505c1c0cbcb ("powerpc: Reserve in bootmem
lmb reserved regions that cross NUMA nodes").

The problem is that Jon took a loop which was (in pseudocode):

	for_each_node(nid)
		NODE_DATA(nid) = careful_alloc(nid);
		setup_bootmem(nid);
		reserve_node_bootmem(nid);

and broke it up into:

	for_each_node(nid)
		NODE_DATA(nid) = careful_alloc(nid);
		setup_bootmem(nid);

	for_each_node(nid)
		reserve_node_bootmem(nid);

The issue comes in when careful_alloc() is called on a node with no
memory.  It falls back to using bootmem from a previously-initialized
node.  But, that bootmem has not yet been reserved at the point where
Jon's patch calls into it.  It gives back bogus memory
(0xc000000000000000) and pukes later in boot.

The following patch collapses the loop back together.  It also breaks
the mark_reserved_regions_for_nid() code out into a function and adds
some comments.  I think a huge part of what introduced this bug is
that the for loop was too long and hard to read.

The actual bug fix here is the:

+	if (end_pfn <= node->node_start_pfn ||
+	    start_pfn >= node_end_pfn)
+		continue;

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
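To make the fix concrete, here is a minimal, self-contained sketch of the
overlap test the patch adds.  This is plain userspace C, not kernel code:
struct node_span and region_touches_node() are hypothetical stand-ins for
the pglist_data fields and the check inside mark_reserved_regions_for_nid().
It shows why the test compares whole PFN ranges rather than the nids of the
two endpoints:

	#include <stdio.h>

	/* Hypothetical stand-in for the pglist_data fields the check reads. */
	struct node_span {
		unsigned long node_start_pfn;
		unsigned long node_spanned_pages;
	};

	/*
	 * A reserved region [start_pfn, end_pfn) concerns this node only
	 * if it overlaps the node's own half-open PFN range.  Checking
	 * the nid of the two endpoints alone would miss a reserved area
	 * that spans the entire node.
	 */
	static int region_touches_node(unsigned long start_pfn,
				       unsigned long end_pfn,
				       const struct node_span *node)
	{
		unsigned long node_end_pfn = node->node_start_pfn +
					     node->node_spanned_pages;

		if (end_pfn <= node->node_start_pfn ||
		    start_pfn >= node_end_pfn)
			return 0;	/* disjoint: the loop would "continue" */
		return 1;
	}

	int main(void)
	{
		/* A node covering PFNs [0x100, 0x200). */
		struct node_span node = { 0x100, 0x100 };

		/* Entirely below the node: skipped. */
		printf("%d\n", region_touches_node(0x000, 0x080, &node)); /* 0 */
		/* Spans the whole node: both endpoints lie outside it,
		 * yet it overlaps and must be reserved on this node. */
		printf("%d\n", region_touches_node(0x080, 0x300, &node)); /* 1 */
		return 0;
	}

Both intervals are half-open, so a region that ends exactly at
node_start_pfn (or starts exactly at node_end_pfn) is correctly treated
as disjoint.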
 arch/powerpc/mm/numa.c | 122
 1 file changed, 75 insertions(+), 47 deletions(-)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index eb505ad34a85..a8397bbad3d4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -865,6 +865,67 @@ static struct notifier_block __cpuinitdata ppc64_numa_nb = {
 	.priority = 1 /* Must run before sched domains notifier. */
 };
 
+static void mark_reserved_regions_for_nid(int nid)
+{
+	struct pglist_data *node = NODE_DATA(nid);
+	int i;
+
+	for (i = 0; i < lmb.reserved.cnt; i++) {
+		unsigned long physbase = lmb.reserved.region[i].base;
+		unsigned long size = lmb.reserved.region[i].size;
+		unsigned long start_pfn = physbase >> PAGE_SHIFT;
+		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
+		struct node_active_region node_ar;
+		unsigned long node_end_pfn = node->node_start_pfn +
+					     node->node_spanned_pages;
+
+		/*
+		 * Check to make sure that this lmb.reserved area is
+		 * within the bounds of the node that we care about.
+		 * Checking the nid of the start and end points is not
+		 * sufficient because the reserved area could span the
+		 * entire node.
+		 */
+		if (end_pfn <= node->node_start_pfn ||
+		    start_pfn >= node_end_pfn)
+			continue;
+
+		get_node_active_region(start_pfn, &node_ar);
+		while (start_pfn < end_pfn &&
+		       node_ar.start_pfn < node_ar.end_pfn) {
+			unsigned long reserve_size = size;
+			/*
+			 * if reserved region extends past active region
+			 * then trim size to active region
+			 */
+			if (end_pfn > node_ar.end_pfn)
+				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
+					- (start_pfn << PAGE_SHIFT);
+			dbg("reserve_bootmem %lx %lx nid=%d\n", physbase,
+				reserve_size, node_ar.nid);
+			reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
+						reserve_size, BOOTMEM_DEFAULT);
+			/*
+			 * if reserved region is contained in the active region
+			 * then done.
+			 */
+			if (end_pfn <= node_ar.end_pfn)
+				break;
+
+			/*
+			 * reserved region extends past the active region
+			 * get next active region that contains this
+			 * reserved region
+			 */
+			start_pfn = node_ar.end_pfn;
+			physbase = start_pfn << PAGE_SHIFT;
+			size = size - reserve_size;
+			get_node_active_region(start_pfn, &node_ar);
+		}
+	}
+}
+
+
 void __init do_init_bootmem(void)
 {
 	int nid;
@@ -890,7 +951,13 @@ void __init do_init_bootmem(void)
 
 		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 
-		/* Allocate the node structure node local if possible */
+		/*
+		 * Allocate the node structure node local if possible
+		 *
+		 * Be careful moving this around, as it relies on all
+		 * previous nodes' bootmem to be initialized and have
+		 * all reserved areas marked.
+		 */
 		NODE_DATA(nid) = careful_allocation(nid,
 					sizeof(struct pglist_data),
 					SMP_CACHE_BYTES, end_pfn);
@@ -922,53 +989,14 @@ void __init do_init_bootmem(void)
 				  start_pfn, end_pfn);
 
 		free_bootmem_with_active_regions(nid, end_pfn);
-	}
-
-	/* Mark reserved regions */
-	for (i = 0; i < lmb.reserved.cnt; i++) {
-		unsigned long physbase = lmb.reserved.region[i].base;
-		unsigned long size = lmb.reserved.region[i].size;
-		unsigned long start_pfn = physbase >> PAGE_SHIFT;
-		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
-		struct node_active_region node_ar;
-
-		get_node_active_region(start_pfn, &node_ar);
-		while (start_pfn < end_pfn &&
-		       node_ar.start_pfn < node_ar.end_pfn) {
-			unsigned long reserve_size = size;
-			/*
-			 * if reserved region extends past active region
-			 * then trim size to active region
-			 */
-			if (end_pfn > node_ar.end_pfn)
-				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
-					- (start_pfn << PAGE_SHIFT);
-			dbg("reserve_bootmem %lx %lx nid=%d\n", physbase,
-				reserve_size, node_ar.nid);
-			reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
-						reserve_size, BOOTMEM_DEFAULT);
-			/*
-			 * if reserved region is contained in the active region
-			 * then done.
-			 */
-			if (end_pfn <= node_ar.end_pfn)
-				break;
-
-			/*
-			 * reserved region extends past the active region
-			 * get next active region that contains this
-			 * reserved region
-			 */
-			start_pfn = node_ar.end_pfn;
-			physbase = start_pfn << PAGE_SHIFT;
-			size = size - reserve_size;
-			get_node_active_region(start_pfn, &node_ar);
-		}
-
-	}
-
-	for_each_online_node(nid)
-		sparse_memory_present_with_active_regions(nid);
+		/*
+		 * Be very careful about moving this around.  Future
+		 * calls to careful_allocation() depend on this getting
+		 * done correctly.
+		 */
+		mark_reserved_regions_for_nid(nid);
+		sparse_memory_present_with_active_regions(nid);
+	}
 }
 
 void __init paging_init(void)