aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
authorJon Mason <jdmason@us.ibm.com>2006-07-29 15:42:43 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-07-29 23:59:55 -0400
commitd2105b10fe0f460c388fe4e09226313f519d8c00 (patch)
tree59ad2f99eeb124ecea6506801eb7f5c0a0a1395d /arch/x86_64
parent089bbbcb36979166131868a89ca5f4e695d6637d (diff)
[PATCH] x86_64: Calgary IOMMU - Multi-Node NULL pointer dereference fix
Calgary hits a NULL pointer dereference when booting in a multi-chassis NUMA system. See Redhat bugzilla number 198498, found by Konrad Rzeszutek (konradr@redhat.com). There are many issues that had to be resolved to fix this problem. Firstly when I originally wrote the code to handle NUMA systems, I had a large misunderstanding that was not corrected until now. That was that I thought the "number of nodes online" referred to number of physical systems connected. So that if NUMA was disabled, there would only be 1 node and it would only show that node's PCI bus. In reality if NUMA is disabled, the system displays all of the connected chassis as one node but is only ignorant of the delays in accessing main memory. Therefore, references to num_online_nodes() and MAX_NUMNODES are incorrect and need to be set to the maximum number of nodes that can be accessed (which are 8). I created a variable, MAX_NUM_CHASSIS, and set it to 8 to fix this. Secondly, when walking the PCI in detect_calgary, the code only checked the first "slot" when looking to see if a device is present. This will work for most cases, but unfortunately it isn't always the case. In the NUMA MXE drawers, there are USB devices present on the 3rd slot (with slot 1 being empty). So, to work around this, all slots (up to 8) are scanned to see if there are any devices present. Lastly, the bus is being enumerated on large systems in a different way the we originally thought. This throws the ugly logic we had out the window. To more elegantly handle this, I reorganized the kva array to be sparse (which removed the need to have any bus number to kva slot logic in tce.c) and created a secondary space array to contain the bus number to phb mapping. With these changes Calgary boots on an x460 with 4 nodes with and without NUMA enabled. Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/kernel/pci-calgary.c76
-rw-r--r--arch/x86_64/kernel/tce.c4
2 files changed, 45 insertions, 35 deletions
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 92744abff133..146924ba5df5 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -85,7 +85,8 @@
85#define CSR_AGENT_MASK 0xffe0ffff 85#define CSR_AGENT_MASK 0xffe0ffff
86 86
87#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ 87#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
88#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */ 88#define MAX_NUM_CHASSIS 8 /* max number of chassis */
89#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */
89#define PHBS_PER_CALGARY 4 90#define PHBS_PER_CALGARY 4
90 91
91/* register offsets in Calgary's internal register space */ 92/* register offsets in Calgary's internal register space */
@@ -110,7 +111,8 @@ static const unsigned long phb_offsets[] = {
110 0xB000 /* PHB3 */ 111 0xB000 /* PHB3 */
111}; 112};
112 113
113void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES]; 114static char bus_to_phb[MAX_PHB_BUS_NUM];
115void* tce_table_kva[MAX_PHB_BUS_NUM];
114unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; 116unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
115static int translate_empty_slots __read_mostly = 0; 117static int translate_empty_slots __read_mostly = 0;
116static int calgary_detected __read_mostly = 0; 118static int calgary_detected __read_mostly = 0;
@@ -119,7 +121,7 @@ static int calgary_detected __read_mostly = 0;
119 * the bitmap of PHBs the user requested that we disable 121 * the bitmap of PHBs the user requested that we disable
120 * translation on. 122 * translation on.
121 */ 123 */
122static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM); 124static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM);
123 125
124static void tce_cache_blast(struct iommu_table *tbl); 126static void tce_cache_blast(struct iommu_table *tbl);
125 127
@@ -452,7 +454,7 @@ static struct dma_mapping_ops calgary_dma_ops = {
452 454
453static inline int busno_to_phbid(unsigned char num) 455static inline int busno_to_phbid(unsigned char num)
454{ 456{
455 return bus_to_phb(num) % PHBS_PER_CALGARY; 457 return bus_to_phb[num];
456} 458}
457 459
458static inline unsigned long split_queue_offset(unsigned char num) 460static inline unsigned long split_queue_offset(unsigned char num)
@@ -812,7 +814,7 @@ static int __init calgary_init(void)
812 int i, ret = -ENODEV; 814 int i, ret = -ENODEV;
813 struct pci_dev *dev = NULL; 815 struct pci_dev *dev = NULL;
814 816
815 for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) { 817 for (i = 0; i < MAX_PHB_BUS_NUM; i++) {
816 dev = pci_get_device(PCI_VENDOR_ID_IBM, 818 dev = pci_get_device(PCI_VENDOR_ID_IBM,
817 PCI_DEVICE_ID_IBM_CALGARY, 819 PCI_DEVICE_ID_IBM_CALGARY,
818 dev); 820 dev);
@@ -822,7 +824,7 @@ static int __init calgary_init(void)
822 calgary_init_one_nontraslated(dev); 824 calgary_init_one_nontraslated(dev);
823 continue; 825 continue;
824 } 826 }
825 if (!tce_table_kva[i] && !translate_empty_slots) { 827 if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) {
826 pci_dev_put(dev); 828 pci_dev_put(dev);
827 continue; 829 continue;
828 } 830 }
@@ -842,7 +844,7 @@ error:
842 pci_dev_put(dev); 844 pci_dev_put(dev);
843 continue; 845 continue;
844 } 846 }
845 if (!tce_table_kva[i] && !translate_empty_slots) 847 if (!tce_table_kva[dev->bus->number] && !translate_empty_slots)
846 continue; 848 continue;
847 calgary_disable_translation(dev); 849 calgary_disable_translation(dev);
848 calgary_free_tar(dev); 850 calgary_free_tar(dev);
@@ -876,9 +878,10 @@ static inline int __init determine_tce_table_size(u64 ram)
876void __init detect_calgary(void) 878void __init detect_calgary(void)
877{ 879{
878 u32 val; 880 u32 val;
879 int bus, table_idx; 881 int bus;
880 void *tbl; 882 void *tbl;
881 int detected = 0; 883 int calgary_found = 0;
884 int phb = -1;
882 885
883 /* 886 /*
884 * if the user specified iommu=off or iommu=soft or we found 887 * if the user specified iommu=off or iommu=soft or we found
@@ -889,37 +892,46 @@ void __init detect_calgary(void)
889 892
890 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); 893 specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
891 894
892 for (bus = 0, table_idx = 0; 895 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
893 bus < num_online_nodes() * MAX_PHB_BUS_NUM; 896 int dev;
894 bus++) { 897
898 tce_table_kva[bus] = NULL;
899 bus_to_phb[bus] = -1;
900
895 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) 901 if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
896 continue; 902 continue;
903
904 /*
905 * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3)
906 * it is connected to releative to the clagary chip.
907 */
908 phb = (phb + 1) % PHBS_PER_CALGARY;
909
897 if (test_bit(bus, translation_disabled)) { 910 if (test_bit(bus, translation_disabled)) {
898 printk(KERN_INFO "Calgary: translation is disabled for " 911 printk(KERN_INFO "Calgary: translation is disabled for "
899 "PHB 0x%x\n", bus); 912 "PHB 0x%x\n", bus);
900 /* skip this phb, don't allocate a tbl for it */ 913 /* skip this phb, don't allocate a tbl for it */
901 tce_table_kva[table_idx] = NULL;
902 table_idx++;
903 continue; 914 continue;
904 } 915 }
905 /* 916 /*
906 * scan the first slot of the PCI bus to see if there 917 * Scan the slots of the PCI bus to see if there is a device present.
907 * are any devices present 918 * The parent bus will be the zero-ith device, so start at 1.
908 */ 919 */
909 val = read_pci_config(bus, 1, 0, 0); 920 for (dev = 1; dev < 8; dev++) {
910 if (val != 0xffffffff || translate_empty_slots) { 921 val = read_pci_config(bus, dev, 0, 0);
911 tbl = alloc_tce_table(); 922 if (val != 0xffffffff || translate_empty_slots) {
912 if (!tbl) 923 tbl = alloc_tce_table();
913 goto cleanup; 924 if (!tbl)
914 detected = 1; 925 goto cleanup;
915 } else 926 tce_table_kva[bus] = tbl;
916 tbl = NULL; 927 bus_to_phb[bus] = phb;
917 928 calgary_found = 1;
918 tce_table_kva[table_idx] = tbl; 929 break;
919 table_idx++; 930 }
931 }
920 } 932 }
921 933
922 if (detected) { 934 if (calgary_found) {
923 iommu_detected = 1; 935 iommu_detected = 1;
924 calgary_detected = 1; 936 calgary_detected = 1;
925 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " 937 printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. "
@@ -928,9 +940,9 @@ void __init detect_calgary(void)
928 return; 940 return;
929 941
930cleanup: 942cleanup:
931 for (--table_idx; table_idx >= 0; --table_idx) 943 for (--bus; bus >= 0; --bus)
932 if (tce_table_kva[table_idx]) 944 if (tce_table_kva[bus])
933 free_tce_table(tce_table_kva[table_idx]); 945 free_tce_table(tce_table_kva[bus]);
934} 946}
935 947
936int __init calgary_iommu_init(void) 948int __init calgary_iommu_init(void)
@@ -1001,7 +1013,7 @@ static int __init calgary_parse_options(char *p)
1001 if (p == endp) 1013 if (p == endp)
1002 break; 1014 break;
1003 1015
1004 if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) { 1016 if (bridge < MAX_PHB_BUS_NUM) {
1005 printk(KERN_INFO "Calgary: disabling " 1017 printk(KERN_INFO "Calgary: disabling "
1006 "translation for PHB 0x%x\n", bridge); 1018 "translation for PHB 0x%x\n", bridge);
1007 set_bit(bridge, translation_disabled); 1019 set_bit(bridge, translation_disabled);
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
index d3a9e79e954c..5530dda3f27a 100644
--- a/arch/x86_64/kernel/tce.c
+++ b/arch/x86_64/kernel/tce.c
@@ -96,7 +96,6 @@ static inline unsigned int table_size_to_number_of_entries(unsigned char size)
96static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) 96static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
97{ 97{
98 unsigned int bitmapsz; 98 unsigned int bitmapsz;
99 unsigned int tce_table_index;
100 unsigned long bmppages; 99 unsigned long bmppages;
101 int ret; 100 int ret;
102 101
@@ -105,8 +104,7 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
105 /* set the tce table size - measured in entries */ 104 /* set the tce table size - measured in entries */
106 tbl->it_size = table_size_to_number_of_entries(specified_table_size); 105 tbl->it_size = table_size_to_number_of_entries(specified_table_size);
107 106
108 tce_table_index = bus_to_phb(tbl->it_busno); 107 tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number];
109 tbl->it_base = (unsigned long)tce_table_kva[tce_table_index];
110 if (!tbl->it_base) { 108 if (!tbl->it_base) {
111 printk(KERN_ERR "Calgary: iommu_table_setparms: " 109 printk(KERN_ERR "Calgary: iommu_table_setparms: "
112 "no table allocated?!\n"); 110 "no table allocated?!\n");