author	Jon Mason <jdmason@us.ibm.com>	2006-07-29 15:42:43 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-07-29 23:59:55 -0400
commit	d2105b10fe0f460c388fe4e09226313f519d8c00 (patch)
tree	59ad2f99eeb124ecea6506801eb7f5c0a0a1395d /arch
parent	089bbbcb36979166131868a89ca5f4e695d6637d (diff)
[PATCH] x86_64: Calgary IOMMU - Multi-Node NULL pointer dereference fix
Calgary hits a NULL pointer dereference when booting on a multi-chassis NUMA system. See Red Hat bugzilla number 198498, found by Konrad Rzeszutek (konradr@redhat.com). Several issues had to be resolved to fix this problem.

Firstly, when I originally wrote the code to handle NUMA systems, I had a large misunderstanding that was not corrected until now: I thought the "number of nodes online" referred to the number of physical systems connected, so that if NUMA was disabled there would be only one node and only that node's PCI buses would be visible. In reality, if NUMA is disabled the system presents all of the connected chassis as one node and is merely ignorant of the differing delays in accessing main memory. Therefore, the references to num_online_nodes() and MAX_NUMNODES are incorrect and need to be replaced by the maximum number of chassis that can be accessed (which is 8). I created a define, MAX_NUM_CHASSIS, and set it to 8 to fix this.

Secondly, when walking the PCI buses in detect_calgary, the code only checked the first "slot" when looking to see if a device is present. This works in most cases, but unfortunately not always. In the NUMA MXE drawers there are USB devices present on the 3rd slot (with slot 1 being empty), so to work around this all slots (up to 8) are now scanned to see if any devices are present.

Lastly, the bus is enumerated on large systems in a different way than we originally thought, which throws our old, ugly bus-numbering logic out the window. To handle this more elegantly, I reorganized the kva array to be sparse (which removes the need for any bus-number-to-kva-slot logic in tce.c) and created a secondary array (bus_to_phb) to contain the bus number to PHB mapping.

With these changes Calgary boots on an x460 with 4 nodes, both with and without NUMA enabled.

Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
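For readers following the data-structure change described above, here is a minimal, self-contained userspace sketch of the idea: tce_table_kva becomes a sparse array indexed directly by dev->bus->number, and a parallel bus_to_phb[] array records which PHB (0-3) of a Calgary chip each bus number belongs to, replacing the old dense table_idx packing and the modulo-based busno_to_phbid(). This is not the kernel code itself; the array and function names mirror the patch, but the simplified types, remember_phb() helper and the demo main() are illustrative assumptions only.

#include <stdio.h>

/* Constants as introduced by the patch (see the first hunk below). */
#define MAX_NUM_OF_PHBS   8                                        /* PHBs per chassis            */
#define MAX_NUM_CHASSIS   8                                        /* max chassis the code may see */
#define MAX_PHB_BUS_NUM   (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)  /* 8 * 8 * 2 = 128 bus numbers  */
#define PHBS_PER_CALGARY  4

/* Sparse bookkeeping, indexed by PCI bus number (illustrative stand-ins
 * for the kernel arrays of the same names). */
static void *tce_table_kva[MAX_PHB_BUS_NUM];
static char  bus_to_phb[MAX_PHB_BUS_NUM];

/* After the patch the lookup is a plain table read, not arithmetic. */
static inline int busno_to_phbid(unsigned char num)
{
	return bus_to_phb[num];
}

/* Hypothetical helper: record that a Calgary PHB was found on "bus";
 * "phb" cycles 0..3 per Calgary chip, as detect_calgary() does. */
static void remember_phb(int bus, int phb, void *tbl)
{
	tce_table_kva[bus] = tbl;
	bus_to_phb[bus] = (char)phb;
}

int main(void)
{
	int dummy_table[2];

	/* Suppose bus 0x21 turned out to be PHB 1 of some Calgary chip. */
	remember_phb(0x21, 1 % PHBS_PER_CALGARY, dummy_table);

	printf("bus 0x21 -> phb %d, table %p\n",
	       busno_to_phbid(0x21), tce_table_kva[0x21]);
	return 0;
}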
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86_64/kernel/pci-calgary.c	76
-rw-r--r--	arch/x86_64/kernel/tce.c	4
2 files changed, 45 insertions, 35 deletions
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 92744abff133..146924ba5df5 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -85,7 +85,8 @@
 #define CSR_AGENT_MASK 0xffe0ffff
 
 #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
-#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */
+#define MAX_NUM_CHASSIS 8 /* max number of chassis */
+#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */
 #define PHBS_PER_CALGARY 4
 
 /* register offsets in Calgary's internal register space */
@@ -110,7 +111,8 @@ static const unsigned long phb_offsets[] = {
 	0xB000 /* PHB3 */
 };
 
-void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES];
+static char bus_to_phb[MAX_PHB_BUS_NUM];
+void* tce_table_kva[MAX_PHB_BUS_NUM];
 unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
 static int translate_empty_slots __read_mostly = 0;
 static int calgary_detected __read_mostly = 0;
@@ -119,7 +121,7 @@ static int calgary_detected __read_mostly = 0;
  * the bitmap of PHBs the user requested that we disable
  * translation on.
  */
-static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM);
+static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM);
 
 static void tce_cache_blast(struct iommu_table *tbl);
 
@@ -452,7 +454,7 @@ static struct dma_mapping_ops calgary_dma_ops = {
 
 static inline int busno_to_phbid(unsigned char num)
 {
-	return bus_to_phb(num) % PHBS_PER_CALGARY;
+	return bus_to_phb[num];
 }
 
 static inline unsigned long split_queue_offset(unsigned char num)
@@ -812,7 +814,7 @@ static int __init calgary_init(void)
 	int i, ret = -ENODEV;
 	struct pci_dev *dev = NULL;
 
-	for (i = 0; i < num_online_nodes() * MAX_NUM_OF_PHBS; i++) {
+	for (i = 0; i < MAX_PHB_BUS_NUM; i++) {
 		dev = pci_get_device(PCI_VENDOR_ID_IBM,
 				     PCI_DEVICE_ID_IBM_CALGARY,
 				     dev);
@@ -822,7 +824,7 @@ static int __init calgary_init(void)
 			calgary_init_one_nontraslated(dev);
 			continue;
 		}
-		if (!tce_table_kva[i] && !translate_empty_slots) {
+		if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) {
 			pci_dev_put(dev);
 			continue;
 		}
@@ -842,7 +844,7 @@ error:
 			pci_dev_put(dev);
 			continue;
 		}
-		if (!tce_table_kva[i] && !translate_empty_slots)
+		if (!tce_table_kva[dev->bus->number] && !translate_empty_slots)
 			continue;
 		calgary_disable_translation(dev);
 		calgary_free_tar(dev);
@@ -876,9 +878,10 @@ static inline int __init determine_tce_table_size(u64 ram)
 void __init detect_calgary(void)
 {
 	u32 val;
-	int bus, table_idx;
+	int bus;
 	void *tbl;
-	int detected = 0;
+	int calgary_found = 0;
+	int phb = -1;
 
 	/*
 	 * if the user specified iommu=off or iommu=soft or we found
@@ -889,37 +892,46 @@ void __init detect_calgary(void)
 
 	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
 
-	for (bus = 0, table_idx = 0;
-	     bus < num_online_nodes() * MAX_PHB_BUS_NUM;
-	     bus++) {
+	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
+		int dev;
+
+		tce_table_kva[bus] = NULL;
+		bus_to_phb[bus] = -1;
+
 		if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
 			continue;
+
+		/*
+		 * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3)
+		 * it is connected to releative to the clagary chip.
+		 */
+		phb = (phb + 1) % PHBS_PER_CALGARY;
+
 		if (test_bit(bus, translation_disabled)) {
 			printk(KERN_INFO "Calgary: translation is disabled for "
 			       "PHB 0x%x\n", bus);
 			/* skip this phb, don't allocate a tbl for it */
-			tce_table_kva[table_idx] = NULL;
-			table_idx++;
 			continue;
 		}
 		/*
-		 * scan the first slot of the PCI bus to see if there
-		 * are any devices present
+		 * Scan the slots of the PCI bus to see if there is a device present.
+		 * The parent bus will be the zero-ith device, so start at 1.
 		 */
-		val = read_pci_config(bus, 1, 0, 0);
-		if (val != 0xffffffff || translate_empty_slots) {
-			tbl = alloc_tce_table();
-			if (!tbl)
-				goto cleanup;
-			detected = 1;
-		} else
-			tbl = NULL;
-
-		tce_table_kva[table_idx] = tbl;
-		table_idx++;
+		for (dev = 1; dev < 8; dev++) {
+			val = read_pci_config(bus, dev, 0, 0);
+			if (val != 0xffffffff || translate_empty_slots) {
+				tbl = alloc_tce_table();
+				if (!tbl)
+					goto cleanup;
+				tce_table_kva[bus] = tbl;
+				bus_to_phb[bus] = phb;
+				calgary_found = 1;
+				break;
+			}
+		}
 	}
 
-	if (detected) {
+	if (calgary_found) {
 		iommu_detected = 1;
 		calgary_detected = 1;
 		printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. "
@@ -928,9 +940,9 @@ void __init detect_calgary(void)
 		return;
 
 cleanup:
-	for (--table_idx; table_idx >= 0; --table_idx)
-		if (tce_table_kva[table_idx])
-			free_tce_table(tce_table_kva[table_idx]);
+	for (--bus; bus >= 0; --bus)
+		if (tce_table_kva[bus])
+			free_tce_table(tce_table_kva[bus]);
 }
 
 int __init calgary_iommu_init(void)
@@ -1001,7 +1013,7 @@ static int __init calgary_parse_options(char *p)
 		if (p == endp)
 			break;
 
-		if (bridge < (num_online_nodes() * MAX_PHB_BUS_NUM)) {
+		if (bridge < MAX_PHB_BUS_NUM) {
 			printk(KERN_INFO "Calgary: disabling "
 			       "translation for PHB 0x%x\n", bridge);
 			set_bit(bridge, translation_disabled);
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
index d3a9e79e954c..5530dda3f27a 100644
--- a/arch/x86_64/kernel/tce.c
+++ b/arch/x86_64/kernel/tce.c
@@ -96,7 +96,6 @@ static inline unsigned int table_size_to_number_of_entries(unsigned char size)
 static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
 {
 	unsigned int bitmapsz;
-	unsigned int tce_table_index;
 	unsigned long bmppages;
 	int ret;
 
@@ -105,8 +104,7 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
 	/* set the tce table size - measured in entries */
 	tbl->it_size = table_size_to_number_of_entries(specified_table_size);
 
-	tce_table_index = bus_to_phb(tbl->it_busno);
-	tbl->it_base = (unsigned long)tce_table_kva[tce_table_index];
+	tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number];
 	if (!tbl->it_base) {
 		printk(KERN_ERR "Calgary: iommu_table_setparms: "
 		       "no table allocated?!\n");