aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Matt Domsch <Matt_Domsch@dell.com> 2006-09-29 16:23:23 -0400
committer Greg Kroah-Hartman <gregkh@suse.de> 2006-10-18 14:36:12 -0400
commit 6b4b78fed47e7380dfe9280b154e8b9bfcd4c86c (patch)
tree 9fbf5135f4ba87fc68681bcd8996d509cac6f9e9
parent 49c61cca2b6591a28ffa4abb73c718091f569746 (diff)
PCI: optionally sort device lists breadth-first
Problem: New Dell PowerEdge servers have 2 embedded ethernet ports, which are labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and in the printed documentation. Assuming no other add-in ethernet ports in the system, Linux 2.4 kernels name these eth0 and eth1 respectively. Many people have come to expect this naming. Linux 2.6 kernels name these eth1 and eth0 respectively (backwards from expectations). I also have reports that various Sun and HP servers have similar behavior. Root cause: Linux 2.4 kernels walk the pci_devices list, which happens to be sorted in breadth-first order (or pcbios_find_device order on i386, which most often is breadth-first also). 2.6 kernels have both the pci_devices list and the pci_bus_type.klist_devices list, the latter is what is walked at driver load time to match the pci_id tables; this klist happens to be in depth-first order. On systems where, for physical routing reasons, NIC1 appears on a lower bus number than NIC2, but NIC2's bridge is discovered first in the depth-first ordering, NIC2 will be discovered before NIC1. If the list were sorted breadth-first, NIC1 would be discovered before NIC2. A PowerEdge 1955 system has the following topology which easily exhibits the difference between depth-first and breadth-first device lists. 
-[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub +-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0) +-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1) Other factors, such as device driver load order and the presence of PCI slots at various points in the bus hierarchy further complicate this problem; I'm not trying to solve those here, just restore the device order, and thus basic behavior, that 2.4 kernels had. Solution: The solution can come in multiple steps. Suggested fix #1: kernel Patch below optionally sorts the two device lists into breadth-first ordering to maintain compatibility with 2.4 kernels. It adds two new command line options: pci=bfsort pci=nobfsort to force the sort order, or not, as you wish. It also adds DMI checks for the specific Dell systems which exhibit "backwards" ordering, to make them "right". Suggested fix #2: udev rules from userland Many people also have the expectation that embedded NICs are always discovered before add-in NICs (which this patch does not try to do). Using the PCI IRQ Routing Table provided by system BIOS, it's easy to determine which PCI devices are embedded, or if add-in, which PCI slot they're in. I'm working on a tool that would allow udev to name ethernet devices in ascending embedded, slot 1 .. slot N order, subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it independent of udev as well for those distributions that don't use udev in their installers. Suggested fix #3: system board routing rules One can constrain the system board layout to put NIC1 ahead of NIC2 regardless of breadth-first or depth-first discovery order. 
This adds a significant level of complexity to board routing, and may not be possible in all instances (witness the above systems from several major manufacturers). I don't want to encourage this particular train of thought too far, at the expense of not doing #1 or #2 above. Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade with 2.6.18. You'll also note I took some liberty and temporarily broke the klist abstraction to simplify and speed up the sort algorithm. I think that's both safe and appropriate in this instance. Signed-off-by: Matt Domsch <Matt_Domsch@dell.com> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--arch/i386/pci/common.c59
-rw-r--r--arch/i386/pci/pci.h7
-rw-r--r--drivers/pci/probe.c92
-rw-r--r--include/linux/pci.h1
5 files changed, 162 insertions, 2 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index ff571f9298e0..dd00fd556a60 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1231,6 +1231,11 @@ and is between 256 and 4096 characters. It is defined in the file
1231 machine check when some devices' config space 1231 machine check when some devices' config space
1232 is read. But various workarounds are disabled 1232 is read. But various workarounds are disabled
1233 and some IOMMU drivers will not work. 1233 and some IOMMU drivers will not work.
1234 bfsort Sort PCI devices into breadth-first order.
1235 This sorting is done to get a device
1236 order compatible with older (<= 2.4) kernels.
1237 nobfsort Don't sort PCI devices into breadth-first order.
1238
1234 pcmv= [HW,PCMCIA] BadgePAD 4 1239 pcmv= [HW,PCMCIA] BadgePAD 4
1235 1240
1236 pd. [PARIDE] 1241 pd. [PARIDE]
diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c
index 68bce194e688..6d5ace845e44 100644
--- a/arch/i386/pci/common.c
+++ b/arch/i386/pci/common.c
@@ -20,6 +20,7 @@
20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | 20unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
21 PCI_PROBE_MMCONF; 21 PCI_PROBE_MMCONF;
22 22
23int pci_bf_sort;
23int pci_routeirq; 24int pci_routeirq;
24int pcibios_last_bus = -1; 25int pcibios_last_bus = -1;
25unsigned long pirq_table_addr; 26unsigned long pirq_table_addr;
@@ -118,6 +119,20 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b)
118} 119}
119 120
120/* 121/*
122 * Only use DMI information to set this if nothing was passed
123 * on the kernel command line (which was parsed earlier).
124 */
125
/*
 * DMI match callback (installed as .callback in pciprobe_dmi_table).
 *
 * If pci_bf_sort still holds pci_bf_sort_default, switch it to
 * pci_dmi_bf and log the matched system's ident string.  Any other
 * value is left untouched, so a pci=bfsort / pci=nobfsort choice that
 * was already recorded wins over the DMI match.
 *
 * @d: the DMI table entry that matched; only d->ident is read.
 *
 * Always returns 0.
 */
126static int __devinit set_bf_sort(struct dmi_system_id *d)
127{
128 if (pci_bf_sort == pci_bf_sort_default) {
129 pci_bf_sort = pci_dmi_bf;
130 printk(KERN_INFO "PCI: %s detected, enabling pci=bfsort.\n", d->ident);
131 }
132 return 0;
133}
134
135/*
121 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) 136 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
122 */ 137 */
123#ifdef __i386__ 138#ifdef __i386__
@@ -130,11 +145,11 @@ static int __devinit assign_all_busses(struct dmi_system_id *d)
130} 145}
131#endif 146#endif
132 147
148static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
149#ifdef __i386__
133/* 150/*
134 * Laptops which need pci=assign-busses to see Cardbus cards 151 * Laptops which need pci=assign-busses to see Cardbus cards
135 */ 152 */
136static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
137#ifdef __i386__
138 { 153 {
139 .callback = assign_all_busses, 154 .callback = assign_all_busses,
140 .ident = "Samsung X20 Laptop", 155 .ident = "Samsung X20 Laptop",
@@ -144,6 +159,38 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
144 }, 159 },
145 }, 160 },
146#endif /* __i386__ */ 161#endif /* __i386__ */
162 {
163 .callback = set_bf_sort,
164 .ident = "Dell PowerEdge 1950",
165 .matches = {
166 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
167 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1950"),
168 },
169 },
170 {
171 .callback = set_bf_sort,
172 .ident = "Dell PowerEdge 1955",
173 .matches = {
174 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
175 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1955"),
176 },
177 },
178 {
179 .callback = set_bf_sort,
180 .ident = "Dell PowerEdge 2900",
181 .matches = {
182 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
183 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2900"),
184 },
185 },
186 {
187 .callback = set_bf_sort,
188 .ident = "Dell PowerEdge 2950",
189 .matches = {
190 DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
191 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2950"),
192 },
193 },
147 {} 194 {}
148}; 195};
149 196
@@ -189,6 +236,8 @@ static int __init pcibios_init(void)
189 236
190 pcibios_resource_survey(); 237 pcibios_resource_survey();
191 238
239 if (pci_bf_sort >= pci_force_bf)
240 pci_sort_breadthfirst();
192#ifdef CONFIG_PCI_BIOS 241#ifdef CONFIG_PCI_BIOS
193 if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT)) 242 if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT))
194 pcibios_sort(); 243 pcibios_sort();
@@ -203,6 +252,12 @@ char * __devinit pcibios_setup(char *str)
203 if (!strcmp(str, "off")) { 252 if (!strcmp(str, "off")) {
204 pci_probe = 0; 253 pci_probe = 0;
205 return NULL; 254 return NULL;
255 } else if (!strcmp(str, "bfsort")) {
256 pci_bf_sort = pci_force_bf;
257 return NULL;
258 } else if (!strcmp(str, "nobfsort")) {
259 pci_bf_sort = pci_force_nobf;
260 return NULL;
206 } 261 }
207#ifdef CONFIG_PCI_BIOS 262#ifdef CONFIG_PCI_BIOS
208 else if (!strcmp(str, "bios")) { 263 else if (!strcmp(str, "bios")) {
diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h
index 1814f74569c6..ad065cebd7b9 100644
--- a/arch/i386/pci/pci.h
+++ b/arch/i386/pci/pci.h
@@ -30,6 +30,13 @@
30extern unsigned int pci_probe; 30extern unsigned int pci_probe;
31extern unsigned long pirq_table_addr; 31extern unsigned long pirq_table_addr;
32 32
/*
 * Values stored in pci_bf_sort to record whether, and why, the PCI
 * device lists should be sorted breadth-first.
 *
 * The enumerator order is significant: pcibios_init() sorts when
 * pci_bf_sort >= pci_force_bf, so every "sort" state must come after
 * every "do not sort" state.
 */
33enum pci_bf_sort_state {
/* First enumerator (0): nothing has selected a sort policy. */
34 pci_bf_sort_default,
/* Set by "pci=nobfsort": do not sort. */
35 pci_force_nobf,
/* Set by "pci=bfsort": sort. */
36 pci_force_bf,
/* Set by the set_bf_sort() DMI callback: sort. */
37 pci_dmi_bf,
38};
39
33/* pci-i386.c */ 40/* pci-i386.c */
34 41
35extern unsigned int pcibios_max_latency; 42extern unsigned int pcibios_max_latency;
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a3b0a5eb5054..e159d6604494 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1067,3 +1067,95 @@ EXPORT_SYMBOL(pci_scan_bridge);
1067EXPORT_SYMBOL(pci_scan_single_device); 1067EXPORT_SYMBOL(pci_scan_single_device);
1068EXPORT_SYMBOL_GPL(pci_scan_child_bus); 1068EXPORT_SYMBOL_GPL(pci_scan_child_bus);
1069#endif 1069#endif
1070
/*
 * Three-way comparison of two PCI devices, used as the ordering for the
 * breadth-first sorts below.
 *
 * Keys, most significant first:
 *   1. PCI domain of the device's bus (pci_domain_nr())
 *   2. bus number
 *   3. devfn
 *
 * Returns -1 if a orders before b, 1 if a orders after b, and 0 when
 * all three keys are equal.
 */
1071static int __init pci_sort_bf_cmp(const struct pci_dev *a, const struct pci_dev *b)
1072{
1073 if (pci_domain_nr(a->bus) < pci_domain_nr(b->bus)) return -1;
1074 else if (pci_domain_nr(a->bus) > pci_domain_nr(b->bus)) return 1;
1075
1076 if (a->bus->number < b->bus->number) return -1;
1077 else if (a->bus->number > b->bus->number) return 1;
1078
1079 if (a->devfn < b->devfn) return -1;
1080 else if (a->devfn > b->devfn) return 1;
1081
1082 return 0;
1083}
1084
1085/*
1086 * Yes, this forcibly breaks the klist abstraction temporarily. It
1087 * just wants to sort the klist, not change reference counts and
1088 * take/drop locks rapidly in the process. It does all this while
1089 * holding the lock for the list, so objects can't otherwise be
1090 * added/removed while we're swizzling.
1091 */
/*
 * One insertion-sort step on raw klist nodes: move device @a into its
 * ordered position within @list.
 *
 * @a:    device to insert; its dev.knode_bus.n_node is unlinked from
 *        whatever list it is currently on and relinked.
 * @list: head of a list of klist_node.n_node links, walked front to
 *        back and expected to already be in pci_sort_bf_cmp() order.
 *
 * This function takes no lock itself; its caller in this file,
 * pci_sort_breadthfirst_klist(), holds the klist's k_lock around it.
 */
1092static void __init pci_insertion_sort_klist(struct pci_dev *a, struct list_head *list)
1093{
1094 struct list_head *pos;
1095 struct klist_node *n;
1096 struct device *dev;
1097 struct pci_dev *b;
1098
1099 list_for_each(pos, list) {
/* Recover the pci_dev that embeds this list link. */
1100 n = container_of(pos, struct klist_node, n_node);
1101 dev = container_of(n, struct device, knode_bus);
1102 b = to_pci_dev(dev);
/* First entry that does not order before a: link a just ahead of it. */
1103 if (pci_sort_bf_cmp(a, b) <= 0) {
1104 list_move_tail(&a->dev.knode_bus.n_node, &b->dev.knode_bus.n_node);
1105 return;
1106 }
1107 }
/* a orders after every entry (or the list is empty): append it. */
1108 list_move_tail(&a->dev.knode_bus.n_node, list);
1109}
1110
/*
 * Re-order pci_bus_type.klist_devices into pci_sort_bf_cmp() order.
 *
 * With the klist's k_lock held, every node is moved one at a time from
 * the klist's k_list into a local, sorted list via
 * pci_insertion_sort_klist(); the sorted list is then spliced back onto
 * k_list before the lock is dropped.
 */
1111static void __init pci_sort_breadthfirst_klist(void)
1112{
1113 LIST_HEAD(sorted_devices);
1114 struct list_head *pos, *tmp;
1115 struct klist_node *n;
1116 struct device *dev;
1117 struct pci_dev *pdev;
1118
1119 spin_lock(&pci_bus_type.klist_devices.k_lock);
/* _safe iteration: each pass unlinks pos from k_list. */
1120 list_for_each_safe(pos, tmp, &pci_bus_type.klist_devices.k_list) {
1121 n = container_of(pos, struct klist_node, n_node);
1122 dev = container_of(n, struct device, knode_bus);
1123 pdev = to_pci_dev(dev);
1124 pci_insertion_sort_klist(pdev, &sorted_devices);
1125 }
/* Every node has been moved out of k_list by now; put the sorted run back. */
1126 list_splice(&sorted_devices, &pci_bus_type.klist_devices.k_list);
1127 spin_unlock(&pci_bus_type.klist_devices.k_lock);
1128}
1129
/*
 * One insertion-sort step on the global_list links: move device @a into
 * its ordered position within @list.
 *
 * @a:    device to insert; its global_list link is unlinked from
 *        whatever list it is currently on and relinked.
 * @list: head of a list of pci_dev.global_list links, walked front to
 *        back and expected to already be in pci_sort_bf_cmp() order.
 *
 * This function takes no lock itself; its caller in this file,
 * pci_sort_breadthfirst_devices(), holds pci_bus_sem for writing.
 */
1130static void __init pci_insertion_sort_devices(struct pci_dev *a, struct list_head *list)
1131{
1132 struct pci_dev *b;
1133
1134 list_for_each_entry(b, list, global_list) {
/* First entry that does not order before a: link a just ahead of it. */
1135 if (pci_sort_bf_cmp(a, b) <= 0) {
1136 list_move_tail(&a->global_list, &b->global_list);
1137 return;
1138 }
1139 }
/* a orders after every entry (or the list is empty): append it. */
1140 list_move_tail(&a->global_list, list);
1141}
1142
/*
 * Re-order the global pci_devices list into pci_sort_bf_cmp() order.
 *
 * With pci_bus_sem held for writing, every device is moved one at a
 * time from pci_devices into a local, sorted list via
 * pci_insertion_sort_devices(); the sorted list is then spliced back
 * onto pci_devices before the semaphore is released.
 */
1143static void __init pci_sort_breadthfirst_devices(void)
1144{
1145 LIST_HEAD(sorted_devices);
1146 struct pci_dev *dev, *tmp;
1147
1148 down_write(&pci_bus_sem);
/* _safe iteration: each pass unlinks dev from pci_devices. */
1149 list_for_each_entry_safe(dev, tmp, &pci_devices, global_list) {
1150 pci_insertion_sort_devices(dev, &sorted_devices);
1151 }
/* Every device has been moved out of pci_devices by now; put the sorted run back. */
1152 list_splice(&sorted_devices, &pci_devices);
1153 up_write(&pci_bus_sem);
1154}
1155
/*
 * Sort both PCI device lists into pci_sort_bf_cmp() order: first the
 * global pci_devices list, then pci_bus_type's klist of devices.
 *
 * Declared in include/linux/pci.h; in this patch it is called from
 * pcibios_init() when pci_bf_sort >= pci_force_bf.
 */
1156void __init pci_sort_breadthfirst(void)
1157{
1158 pci_sort_breadthfirst_devices();
1159 pci_sort_breadthfirst_klist();
1160}
1161
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 09bf88fc80c5..4689e2a699c0 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -443,6 +443,7 @@ extern void pci_remove_bus(struct pci_bus *b);
443extern void pci_remove_bus_device(struct pci_dev *dev); 443extern void pci_remove_bus_device(struct pci_dev *dev);
444extern void pci_stop_bus_device(struct pci_dev *dev); 444extern void pci_stop_bus_device(struct pci_dev *dev);
445void pci_setup_cardbus(struct pci_bus *bus); 445void pci_setup_cardbus(struct pci_bus *bus);
446extern void pci_sort_breadthfirst(void);
446 447
447/* Generic PCI functions exported to card drivers */ 448/* Generic PCI functions exported to card drivers */
448 449