author    Ramkrishna Vepa <ramkrishna.vepa@intel.com>    2013-06-02 15:16:11 -0400
committer Roland Dreier <roland@purestorage.com>         2013-06-21 20:19:49 -0400
commit    c804f07248895ff9c9dccb6cda703068a0657b6c (patch)
tree      9e7db0765d120ddff741db3cf9a4c90f849f1c23 /drivers/infiniband/hw/qib/qib_file_ops.c
parent    e0f30baca1ebe5547f6760f760b8c4e189fc1203 (diff)
IB/qib: Add dual-rail NUMA awareness for PSM processes
The driver currently selects an HCA based on the context-selection algorithm that PSM chooses - round-robin selection of contexts either 'within' or 'across' HCAs. The HCA can also be chosen explicitly by the user. Either way, this patch assigns the process a CPU on the NUMA node local to the selected HCA.

This patch also tries to select the HCA closest to the NUMA node of the CPU that was assigned to the PSM process via taskset. If that HCA is unusable, another unit is selected according to the algorithm currently enforced or selected by PSM.

Also fixed a bug wherein contexts were set up (in setup_ctxt()) on the NUMA node of the opening process rather than on the NUMA node of the CPU the driver recommends.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Vinit Agnihotri <vinit.abhay.agnihotri@intel.com>
Signed-off-by: Ramkrishna Vepa <ramkrishna.vepa@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
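[Editor's note] As background for the find_hca() helper added by this patch, the sketch below illustrates the locality test it is built on: a CPU and an HCA count as local to each other when cpu_to_node() for the CPU matches pcibus_to_node() for the device's PCI bus. This is an illustration, not driver code; hca_is_local_to_cpu() is a hypothetical name, while cpu_to_node(), pcibus_to_node() and struct qib_devdata are the real kernel/driver identifiers that appear in the diff below.

#include <linux/pci.h>       /* struct pci_dev, struct pci_bus */
#include <linux/topology.h>  /* cpu_to_node(), pcibus_to_node() */

/* Hypothetical helper mirroring the per-unit node comparison in find_hca(). */
static bool hca_is_local_to_cpu(unsigned int cpu, struct qib_devdata *dd)
{
        int bus_node = pcibus_to_node(dd->pcidev->bus);

        /* Firmware may not report PCI bus locality; find_hca() treats a
         * negative node as an error (-EINVAL) rather than guessing. */
        if (bus_node < 0)
                return false;

        return cpu_to_node(cpu) == bus_node;
}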
Diffstat (limited to 'drivers/infiniband/hw/qib/qib_file_ops.c')
-rw-r--r--  drivers/infiniband/hw/qib/qib_file_ops.c  |  174
1 file changed, 125 insertions(+), 49 deletions(-)
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 65b2fc3f957c..df3808a38381 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
  * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
  * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
  *
@@ -1155,6 +1155,49 @@ static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt)
         return pollflag;
 }
 
+static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
+{
+        struct qib_filedata *fd = fp->private_data;
+        const unsigned int weight = cpumask_weight(&current->cpus_allowed);
+        const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
+        int local_cpu;
+
+        /*
+         * If process has NOT already set its affinity, select and
+         * reserve a processor for it on the local NUMA node.
+         */
+        if ((weight >= qib_cpulist_count) &&
+            (cpumask_weight(local_mask) <= qib_cpulist_count)) {
+                for_each_cpu(local_cpu, local_mask)
+                        if (!test_and_set_bit(local_cpu, qib_cpulist)) {
+                                fd->rec_cpu_num = local_cpu;
+                                return;
+                        }
+        }
+
+        /*
+         * If process has NOT already set its affinity, select and
+         * reserve a processor for it, as a rendezvous for all
+         * users of the driver. If they don't actually later
+         * set affinity to this cpu, or set it to some other cpu,
+         * it just means that sooner or later we don't recommend
+         * a cpu, and let the scheduler do its best.
+         */
+        if (weight >= qib_cpulist_count) {
+                int cpu;
+                cpu = find_first_zero_bit(qib_cpulist,
+                                          qib_cpulist_count);
+                if (cpu == qib_cpulist_count)
+                        qib_dev_err(dd,
+                                    "no cpus avail for affinity PID %u\n",
+                                    current->pid);
+                else {
+                        __set_bit(cpu, qib_cpulist);
+                        fd->rec_cpu_num = cpu;
+                }
+        }
+}
+
 /*
  * Check that userland and driver are compatible for subcontexts.
  */
@@ -1259,14 +1302,18 @@ bail:
 static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,
                       struct file *fp, const struct qib_user_info *uinfo)
 {
+        struct qib_filedata *fd = fp->private_data;
         struct qib_devdata *dd = ppd->dd;
         struct qib_ctxtdata *rcd;
         void *ptmp = NULL;
         int ret;
         int numa_id;
 
-        numa_id = qib_numa_aware ? numa_node_id() :
-                dd->assigned_node_id;
+        assign_ctxt_affinity(fp, dd);
+
+        numa_id = qib_numa_aware ? ((fd->rec_cpu_num != -1) ?
+                cpu_to_node(fd->rec_cpu_num) :
+                numa_node_id()) : dd->assigned_node_id;
 
         rcd = qib_create_ctxtdata(ppd, ctxt, numa_id);
 
@@ -1300,6 +1347,9 @@ static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,
                 goto bail;
 
 bailerr:
+        if (fd->rec_cpu_num != -1)
+                __clear_bit(fd->rec_cpu_num, qib_cpulist);
+
         dd->rcd[ctxt] = NULL;
         kfree(rcd);
         kfree(ptmp);
@@ -1489,6 +1539,57 @@ static int qib_open(struct inode *in, struct file *fp)
         return fp->private_data ? 0 : -ENOMEM;
 }
 
+static int find_hca(unsigned int cpu, int *unit)
+{
+        int ret = 0, devmax, npresent, nup, ndev;
+
+        *unit = -1;
+
+        devmax = qib_count_units(&npresent, &nup);
+        if (!npresent) {
+                ret = -ENXIO;
+                goto done;
+        }
+        if (!nup) {
+                ret = -ENETDOWN;
+                goto done;
+        }
+        for (ndev = 0; ndev < devmax; ndev++) {
+                struct qib_devdata *dd = qib_lookup(ndev);
+                if (dd) {
+                        if (pcibus_to_node(dd->pcidev->bus) < 0) {
+                                ret = -EINVAL;
+                                goto done;
+                        }
+                        if (cpu_to_node(cpu) ==
+                            pcibus_to_node(dd->pcidev->bus)) {
+                                *unit = ndev;
+                                goto done;
+                        }
+                }
+        }
+done:
+        return ret;
+}
+
+static int do_qib_user_sdma_queue_create(struct file *fp)
+{
+        struct qib_filedata *fd = fp->private_data;
+        struct qib_ctxtdata *rcd = fd->rcd;
+        struct qib_devdata *dd = rcd->dd;
+
+        if (dd->flags & QIB_HAS_SEND_DMA) {
+                fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev,
+                                                    dd->unit,
+                                                    rcd->ctxt,
+                                                    fd->subctxt);
+                if (!fd->pq)
+                        return -ENOMEM;
+        }
+
+        return 0;
+}
+
 /*
  * Get ctxt early, so can set affinity prior to memory allocation.
  */
@@ -1521,61 +1622,36 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
         if (qib_compatible_subctxts(swmajor, swminor) &&
             uinfo->spu_subctxt_cnt) {
                 ret = find_shared_ctxt(fp, uinfo);
-                if (ret) {
-                        if (ret > 0)
-                                ret = 0;
-                        goto done_chk_sdma;
+                if (ret > 0) {
+                        ret = do_qib_user_sdma_queue_create(fp);
+                        if (!ret)
+                                assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd);
+                        goto done_ok;
                 }
         }
 
         i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE;
         if (i_minor)
                 ret = find_free_ctxt(i_minor - 1, fp, uinfo);
-        else
+        else {
+                int unit;
+                const unsigned int cpu = cpumask_first(&current->cpus_allowed);
+                const unsigned int weight =
+                        cpumask_weight(&current->cpus_allowed);
+
+                if (weight == 1 && !test_bit(cpu, qib_cpulist))
+                        if (!find_hca(cpu, &unit) && unit >= 0)
+                                if (!find_free_ctxt(unit, fp, uinfo)) {
+                                        ret = 0;
+                                        goto done_chk_sdma;
+                                }
                 ret = get_a_ctxt(fp, uinfo, alg);
-
-done_chk_sdma:
-        if (!ret) {
-                struct qib_filedata *fd = fp->private_data;
-                const struct qib_ctxtdata *rcd = fd->rcd;
-                const struct qib_devdata *dd = rcd->dd;
-                unsigned int weight;
-
-                if (dd->flags & QIB_HAS_SEND_DMA) {
-                        fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev,
-                                                            dd->unit,
-                                                            rcd->ctxt,
-                                                            fd->subctxt);
-                        if (!fd->pq)
-                                ret = -ENOMEM;
-                }
-
-                /*
-                 * If process has NOT already set it's affinity, select and
-                 * reserve a processor for it, as a rendezvous for all
-                 * users of the driver. If they don't actually later
-                 * set affinity to this cpu, or set it to some other cpu,
-                 * it just means that sooner or later we don't recommend
-                 * a cpu, and let the scheduler do it's best.
-                 */
-                weight = cpumask_weight(tsk_cpus_allowed(current));
-                if (!ret && weight >= qib_cpulist_count) {
-                        int cpu;
-                        cpu = find_first_zero_bit(qib_cpulist,
-                                                  qib_cpulist_count);
-                        if (cpu != qib_cpulist_count) {
-                                __set_bit(cpu, qib_cpulist);
-                                fd->rec_cpu_num = cpu;
-                        }
-                } else if (weight == 1 &&
-                           test_bit(cpumask_first(tsk_cpus_allowed(current)),
-                                    qib_cpulist))
-                        qib_devinfo(dd->pcidev,
-                                    "%s PID %u affinity set to cpu %d; already allocated\n",
-                                    current->comm, current->pid,
-                                    cpumask_first(tsk_cpus_allowed(current)));
-        }
+        }
 
+done_chk_sdma:
+        if (!ret)
+                ret = do_qib_user_sdma_queue_create(fp);
+done_ok:
         mutex_unlock(&qib_mutex);
 
 done:
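
[Editor's note] A closing observation on assign_ctxt_affinity() above: the CPU recommendation is a simple shared-bitmap protocol - one bit per CPU in qib_cpulist, claimed with test_and_set_bit() on the local-node path (or find_first_zero_bit() plus __set_bit() under qib_mutex on the fallback path) and released with __clear_bit() in the bailerr: path of setup_ctxt(). The fragment below is a standalone, userspace-runnable sketch of the same claim/release pattern; NCPUS, cpu_bitmap, claim_cpu() and release_cpu() are hypothetical stand-ins, not driver identifiers.

#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 8

/* One flag per CPU; set means "already recommended to some process".
 * Element 0 is initialized explicitly; the rest are zero-initialized
 * (clear) by static storage rules. */
static atomic_flag cpu_bitmap[NCPUS] = { ATOMIC_FLAG_INIT };

/* Claim the first free CPU, like the driver's test_and_set_bit() scan. */
static int claim_cpu(void)
{
        for (int cpu = 0; cpu < NCPUS; cpu++)
                if (!atomic_flag_test_and_set(&cpu_bitmap[cpu]))
                        return cpu;     /* this recommendation is now ours */
        return -1;                      /* none free: let the scheduler decide */
}

/* Release on teardown, like __clear_bit() in the bailerr: path. */
static void release_cpu(int cpu)
{
        if (cpu != -1)
                atomic_flag_clear(&cpu_bitmap[cpu]);
}

int main(void)
{
        int a = claim_cpu(), b = claim_cpu();

        printf("claimed cpus %d and %d\n", a, b);       /* 0 and 1 */
        release_cpu(a);
        printf("reclaimed cpu %d\n", claim_cpu());      /* 0 again */
        release_cpu(b);
        return 0;
}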