aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/pci
diff options
context:
space:
mode:
authorJon Mason <mason@myri.com>2011-07-20 16:20:54 -0400
committerJesse Barnes <jbarnes@virtuousgeek.org>2011-08-01 14:49:16 -0400
commitb03e7495a862b028294f59fc87286d6d78ee7fa1 (patch)
tree836fbfc2b0e34f034cb273c4d065baba3a65178c /drivers/pci
parent5f66d2b58ca879e70740c82422354144845d6dd3 (diff)
PCI: Set PCI-E Max Payload Size on fabric
On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sent by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved in two ways: - A conservative approach is to use the smallest MPS supported by any device in the entire tree below a root complex for every device on that fabric. This means, for example, that having a 128-byte MPS USB controller on one leg of a switch will dramatically reduce the performance of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (presumably including ExpressCard and Thunderbolt; this should be double-checked) will have to be entirely clamped to 128 bytes, since we cannot predict what will be plugged into those slots and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a few constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or, at least, its packet size can be controlled independently from its MPS) * The device will never generate packets larger than MPS (which can be configured via MRRS) * There is no support for direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case. Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges at the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS.
Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Diffstat (limited to 'drivers/pci')
-rw-r--r--drivers/pci/hotplug/pcihp_slot.c45
-rw-r--r--drivers/pci/pci.c67
-rw-r--r--drivers/pci/probe.c145
3 files changed, 213 insertions, 44 deletions
diff --git a/drivers/pci/hotplug/pcihp_slot.c b/drivers/pci/hotplug/pcihp_slot.c
index 749fdf070319..753b21aaea61 100644
--- a/drivers/pci/hotplug/pcihp_slot.c
+++ b/drivers/pci/hotplug/pcihp_slot.c
@@ -158,47 +158,6 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp)
158 */ 158 */
159} 159}
160 160
161/* Program PCIE MaxPayload setting on device: ensure parent maxpayload <= device */
162static int pci_set_payload(struct pci_dev *dev)
163{
164 int pos, ppos;
165 u16 pctl, psz;
166 u16 dctl, dsz, dcap, dmax;
167 struct pci_dev *parent;
168
169 parent = dev->bus->self;
170 pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
171 if (!pos)
172 return 0;
173
174 /* Read Device MaxPayload capability and setting */
175 pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &dctl);
176 pci_read_config_word(dev, pos + PCI_EXP_DEVCAP, &dcap);
177 dsz = (dctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
178 dmax = (dcap & PCI_EXP_DEVCAP_PAYLOAD);
179
180 /* Read Parent MaxPayload setting */
181 ppos = pci_find_capability(parent, PCI_CAP_ID_EXP);
182 if (!ppos)
183 return 0;
184 pci_read_config_word(parent, ppos + PCI_EXP_DEVCTL, &pctl);
185 psz = (pctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
186
187 /* If parent payload > device max payload -> error
188 * If parent payload > device payload -> set speed
189 * If parent payload <= device payload -> do nothing
190 */
191 if (psz > dmax)
192 return -1;
193 else if (psz > dsz) {
194 dev_info(&dev->dev, "Setting MaxPayload to %d\n", 128 << psz);
195 pci_write_config_word(dev, pos + PCI_EXP_DEVCTL,
196 (dctl & ~PCI_EXP_DEVCTL_PAYLOAD) +
197 (psz << 5));
198 }
199 return 0;
200}
201
202void pci_configure_slot(struct pci_dev *dev) 161void pci_configure_slot(struct pci_dev *dev)
203{ 162{
204 struct pci_dev *cdev; 163 struct pci_dev *cdev;
@@ -210,9 +169,7 @@ void pci_configure_slot(struct pci_dev *dev)
210 (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI))) 169 (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI)))
211 return; 170 return;
212 171
213 ret = pci_set_payload(dev); 172 pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss);
214 if (ret)
215 dev_warn(&dev->dev, "could not set device max payload\n");
216 173
217 memset(&hpp, 0, sizeof(hpp)); 174 memset(&hpp, 0, sizeof(hpp));
218 ret = pci_get_hp_params(dev, &hpp); 175 ret = pci_get_hp_params(dev, &hpp);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 08a95b369d85..466fad6e6ee2 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -77,6 +77,8 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
77unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE; 77unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE;
78unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE; 78unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
79 79
80enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE;
81
80/* 82/*
81 * The default CLS is used if arch didn't set CLS explicitly and not 83 * The default CLS is used if arch didn't set CLS explicitly and not
82 * all pci devices agree on the same value. Arch can override either 84 * all pci devices agree on the same value. Arch can override either
@@ -3223,6 +3225,67 @@ out:
3223EXPORT_SYMBOL(pcie_set_readrq); 3225EXPORT_SYMBOL(pcie_set_readrq);
3224 3226
3225/** 3227/**
3228 * pcie_get_mps - get PCI Express maximum payload size
3229 * @dev: PCI device to query
3230 *
3231 * Returns maximum payload size in bytes
3232 * or appropriate error value.
3233 */
3234int pcie_get_mps(struct pci_dev *dev)
3235{
3236 int ret, cap;
3237 u16 ctl;
3238
3239 cap = pci_pcie_cap(dev);
3240 if (!cap)
3241 return -EINVAL;
3242
3243 ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
3244 if (!ret)
3245 ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5);
3246
3247 return ret;
3248}
3249
3250/**
3251 * pcie_set_mps - set PCI Express maximum payload size
3252 * @dev: PCI device to query
3253 * @rq: maximum payload size in bytes
3254 * valid values are 128, 256, 512, 1024, 2048, 4096
3255 *
3256 * If possible sets maximum payload size
3257 */
3258int pcie_set_mps(struct pci_dev *dev, int mps)
3259{
3260 int cap, err = -EINVAL;
3261 u16 ctl, v;
3262
3263 if (mps < 128 || mps > 4096 || !is_power_of_2(mps))
3264 goto out;
3265
3266 v = ffs(mps) - 8;
3267 if (v > dev->pcie_mpss)
3268 goto out;
3269 v <<= 5;
3270
3271 cap = pci_pcie_cap(dev);
3272 if (!cap)
3273 goto out;
3274
3275 err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
3276 if (err)
3277 goto out;
3278
3279 if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) {
3280 ctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
3281 ctl |= v;
3282 err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl);
3283 }
3284out:
3285 return err;
3286}
3287
3288/**
3226 * pci_select_bars - Make BAR mask from the type of resource 3289 * pci_select_bars - Make BAR mask from the type of resource
3227 * @dev: the PCI device for which BAR mask is made 3290 * @dev: the PCI device for which BAR mask is made
3228 * @flags: resource type mask to be selected 3291 * @flags: resource type mask to be selected
@@ -3505,6 +3568,10 @@ static int __init pci_setup(char *str)
3505 pci_hotplug_io_size = memparse(str + 9, &str); 3568 pci_hotplug_io_size = memparse(str + 9, &str);
3506 } else if (!strncmp(str, "hpmemsize=", 10)) { 3569 } else if (!strncmp(str, "hpmemsize=", 10)) {
3507 pci_hotplug_mem_size = memparse(str + 10, &str); 3570 pci_hotplug_mem_size = memparse(str + 10, &str);
3571 } else if (!strncmp(str, "pcie_bus_safe", 13)) {
3572 pcie_bus_config = PCIE_BUS_SAFE;
3573 } else if (!strncmp(str, "pcie_bus_perf", 13)) {
3574 pcie_bus_config = PCIE_BUS_PERFORMANCE;
3508 } else { 3575 } else {
3509 printk(KERN_ERR "PCI: Unknown option `%s'\n", 3576 printk(KERN_ERR "PCI: Unknown option `%s'\n",
3510 str); 3577 str);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 795c9026d55f..5becf7cd50d8 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -856,6 +856,8 @@ void set_pcie_port_type(struct pci_dev *pdev)
856 pdev->pcie_cap = pos; 856 pdev->pcie_cap = pos;
857 pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16); 857 pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
858 pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4; 858 pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
859 pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
860 pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
859} 861}
860 862
861void set_pcie_hotplug_bridge(struct pci_dev *pdev) 863void set_pcie_hotplug_bridge(struct pci_dev *pdev)
@@ -1326,6 +1328,149 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
1326 return nr; 1328 return nr;
1327} 1329}
1328 1330
1331static int pcie_find_smpss(struct pci_dev *dev, void *data)
1332{
1333 u8 *smpss = data;
1334
1335 if (!pci_is_pcie(dev))
1336 return 0;
1337
1338 /* For PCIE hotplug enabled slots not connected directly to a
1339 * PCI-E root port, there can be problems when hotplugging
1340 * devices. This is due to the possibility of hotplugging a
1341 * device into the fabric with a smaller MPS that the devices
1342 * currently running have configured. Modifying the MPS on the
1343 * running devices could cause a fatal bus error due to an
1344 * incoming frame being larger than the newly configured MPS.
1345 * To work around this, the MPS for the entire fabric must be
1346 * set to the minimum size. Any devices hotplugged into this
1347 * fabric will have the minimum MPS set. If the PCI hotplug
1348 * slot is directly connected to the root port and there are not
1349 * other devices on the fabric (which seems to be the most
1350 * common case), then this is not an issue and MPS discovery
1351 * will occur as normal.
1352 */
1353 if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
1354 dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT))
1355 *smpss = 0;
1356
1357 if (*smpss > dev->pcie_mpss)
1358 *smpss = dev->pcie_mpss;
1359
1360 return 0;
1361}
1362
1363static void pcie_write_mps(struct pci_dev *dev, int mps)
1364{
1365 int rc, dev_mpss;
1366
1367 dev_mpss = 128 << dev->pcie_mpss;
1368
1369 if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
1370 if (dev->bus->self) {
1371 dev_dbg(&dev->bus->dev, "Bus MPSS %d\n",
1372 128 << dev->bus->self->pcie_mpss);
1373
1374 /* For "MPS Force Max", the assumption is made that
1375 * downstream communication will never be larger than
1376 * the MRRS. So, the MPS only needs to be configured
1377 * for the upstream communication. This being the case,
1378 * walk from the top down and set the MPS of the child
1379 * to that of the parent bus.
1380 */
1381 mps = 128 << dev->bus->self->pcie_mpss;
1382 if (mps > dev_mpss)
1383 dev_warn(&dev->dev, "MPS configured higher than"
1384 " maximum supported by the device. If"
1385 " a bus issue occurs, try running with"
1386 " pci=pcie_bus_safe.\n");
1387 }
1388
1389 dev->pcie_mpss = ffs(mps) - 8;
1390 }
1391
1392 rc = pcie_set_mps(dev, mps);
1393 if (rc)
1394 dev_err(&dev->dev, "Failed attempting to set the MPS\n");
1395}
1396
1397static void pcie_write_mrrs(struct pci_dev *dev, int mps)
1398{
1399 int rc, mrrs;
1400
1401 if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
1402 int dev_mpss = 128 << dev->pcie_mpss;
1403
1404 /* For Max performance, the MRRS must be set to the largest
1405 * supported value. However, it cannot be configured larger
1406 * than the MPS the device or the bus can support. This assumes
1407 * that the largest MRRS available on the device cannot be
1408 * smaller than the device MPSS.
1409 */
1410 mrrs = mps < dev_mpss ? mps : dev_mpss;
1411 } else
1412 /* In the "safe" case, configure the MRRS for fairness on the
1413 * bus by making all devices have the same size
1414 */
1415 mrrs = mps;
1416
1417
1418 /* MRRS is a R/W register. Invalid values can be written, but a
1419 * subsiquent read will verify if the value is acceptable or not.
1420 * If the MRRS value provided is not acceptable (e.g., too large),
1421 * shrink the value until it is acceptable to the HW.
1422 */
1423 while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
1424 rc = pcie_set_readrq(dev, mrrs);
1425 if (rc)
1426 dev_err(&dev->dev, "Failed attempting to set the MRRS\n");
1427
1428 mrrs /= 2;
1429 }
1430}
1431
1432static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
1433{
1434 int mps = 128 << *(u8 *)data;
1435
1436 if (!pci_is_pcie(dev))
1437 return 0;
1438
1439 dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
1440 pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
1441
1442 pcie_write_mps(dev, mps);
1443 pcie_write_mrrs(dev, mps);
1444
1445 dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
1446 pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
1447
1448 return 0;
1449}
1450
1451/* pcie_bus_configure_mps requires that pci_walk_bus work in a top-down,
1452 * parents then children fashion. If this changes, then this code will not
1453 * work as designed.
1454 */
1455void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss)
1456{
1457 u8 smpss = mpss;
1458
1459 if (!bus->self)
1460 return;
1461
1462 if (!pci_is_pcie(bus->self))
1463 return;
1464
1465 if (pcie_bus_config == PCIE_BUS_SAFE) {
1466 pcie_find_smpss(bus->self, &smpss);
1467 pci_walk_bus(bus, pcie_find_smpss, &smpss);
1468 }
1469
1470 pcie_bus_configure_set(bus->self, &smpss);
1471 pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
1472}
1473
1329unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus) 1474unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
1330{ 1475{
1331 unsigned int devfn, pass, max = bus->secondary; 1476 unsigned int devfn, pass, max = bus->secondary;