author		Mitko Haralanov <mitko.haralanov@intel.com>	2016-02-05 11:57:58 -0500
committer	Doug Ledford <dledford@redhat.com>	2016-02-29 17:10:39 -0500
commit		0b091fb32c5ae4737bf606a313e6625dad34bbc6 (patch)
tree		27874d46baa52eb329a4db0e1b7e48dbcd20bce0
parent		7e7a436ecb6e703a232df0613b5f24accbe3d7d2 (diff)
staging/hfi1: Enable TID caching feature
This commit "flips the switch" on the TID caching feature implemented in this patch series. In addition to enabling the new feature by tying the new functions into the PSM API, it also cleans up the old, now-unneeded code, data structure members, and variables.

Due to differences in operation and information, the tracing functions related to expected receives had to be changed. This patch includes those changes. The tracing function changes could not be split into a separate commit without carrying both tracing variants at the same time, which would have caused other complications and ugliness.

Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
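For orientation, the expected-receive commands are driven from user space by write()s of struct hfi1_cmd on the device file. Below is a minimal, hypothetical sketch of how a PSM-like consumer would program a receive buffer after this change. It assumes the post-patch layout of hfi1_user.h shown at the end of this diff; the hfi1_cmd field names (type, len, addr) are assumptions, and error handling is elided.

    /* Hypothetical sketch: register a page-aligned buffer as expected TIDs. */
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <rdma/hfi/hfi1_user.h>

    static int tid_update(int dev_fd, void *buf, __u32 len,
                          __u32 *tidlist, __u32 *tidcnt)
    {
            struct hfi1_tid_info tinfo;
            struct hfi1_cmd cmd;

            memset(&tinfo, 0, sizeof(tinfo));
            tinfo.vaddr = (__u64)(uintptr_t)buf;       /* must be page aligned */
            tinfo.length = len;
            tinfo.tidlist = (__u64)(uintptr_t)tidlist; /* filled in by the driver */

            memset(&cmd, 0, sizeof(cmd));
            cmd.type = HFI1_CMD_TID_UPDATE;            /* field names assumed */
            cmd.len = sizeof(tinfo);
            cmd.addr = (__u64)(uintptr_t)&tinfo;

            if (write(dev_fd, &cmd, sizeof(cmd)) < 0)
                    return -1;

            /*
             * On success the driver writes tidcnt and length back into
             * tinfo; per the "does not unwind" comment in the code below,
             * a partial mapping is reported via length, not an error.
             */
            *tidcnt = tinfo.tidcnt;
            return 0;
    }

The caller should therefore compare tinfo.length against the requested length rather than relying on the return value alone.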
-rw-r--r--	drivers/staging/rdma/hfi1/file_ops.c	448
-rw-r--r--	drivers/staging/rdma/hfi1/hfi.h	14
-rw-r--r--	drivers/staging/rdma/hfi1/init.c	3
-rw-r--r--	drivers/staging/rdma/hfi1/trace.h	132
-rw-r--r--	drivers/staging/rdma/hfi1/user_exp_rcv.c	12
-rw-r--r--	drivers/staging/rdma/hfi1/user_pages.c	14
-rw-r--r--	include/uapi/rdma/hfi/hfi1_user.h	7
7 files changed, 132 insertions(+), 498 deletions(-)
diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
index b0348263b901..d36588934f99 100644
--- a/drivers/staging/rdma/hfi1/file_ops.c
+++ b/drivers/staging/rdma/hfi1/file_ops.c
@@ -96,9 +96,6 @@ static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
 static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
 static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
 static int vma_fault(struct vm_area_struct *, struct vm_fault *);
-static int exp_tid_setup(struct file *, struct hfi1_tid_info *);
-static int exp_tid_free(struct file *, struct hfi1_tid_info *);
-static void unlock_exp_tids(struct hfi1_ctxtdata *);
 
 static const struct file_operations hfi1_file_ops = {
     .owner = THIS_MODULE,
@@ -188,6 +185,7 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
     struct hfi1_cmd cmd;
     struct hfi1_user_info uinfo;
     struct hfi1_tid_info tinfo;
+    unsigned long addr;
     ssize_t consumed = 0, copy = 0, ret = 0;
     void *dest = NULL;
     __u64 user_val = 0;
@@ -219,6 +217,7 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
         break;
     case HFI1_CMD_TID_UPDATE:
     case HFI1_CMD_TID_FREE:
+    case HFI1_CMD_TID_INVAL_READ:
         copy = sizeof(tinfo);
         dest = &tinfo;
         break;
@@ -241,7 +240,6 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
         must_be_root = 1;	/* validate user */
         copy = 0;
         break;
-    case HFI1_CMD_TID_INVAL_READ:
     default:
         ret = -EINVAL;
         goto bail;
@@ -295,9 +293,8 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
         sc_return_credits(uctxt->sc);
         break;
     case HFI1_CMD_TID_UPDATE:
-        ret = exp_tid_setup(fp, &tinfo);
+        ret = hfi1_user_exp_rcv_setup(fp, &tinfo);
         if (!ret) {
-            unsigned long addr;
             /*
              * Copy the number of tidlist entries we used
              * and the length of the buffer we registered.
@@ -312,8 +309,25 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
                 ret = -EFAULT;
         }
         break;
+    case HFI1_CMD_TID_INVAL_READ:
+        ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
+        if (ret)
+            break;
+        addr = (unsigned long)cmd.addr +
+            offsetof(struct hfi1_tid_info, tidcnt);
+        if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                 sizeof(tinfo.tidcnt)))
+            ret = -EFAULT;
+        break;
     case HFI1_CMD_TID_FREE:
-        ret = exp_tid_free(fp, &tinfo);
+        ret = hfi1_user_exp_rcv_clear(fp, &tinfo);
+        if (ret)
+            break;
+        addr = (unsigned long)cmd.addr +
+            offsetof(struct hfi1_tid_info, tidcnt);
+        if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                 sizeof(tinfo.tidcnt)))
+            ret = -EFAULT;
         break;
     case HFI1_CMD_RECV_CTRL:
         ret = manage_rcvq(uctxt, fd->subctxt, (int)user_val);
@@ -779,12 +793,9 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
     uctxt->pionowait = 0;
     uctxt->event_flags = 0;
 
-    hfi1_clear_tids(uctxt);
+    hfi1_user_exp_rcv_free(fdata);
     hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
 
-    if (uctxt->tid_pg_list)
-        unlock_exp_tids(uctxt);
-
     hfi1_stats.sps_ctxts--;
     dd->freectxts++;
     mutex_unlock(&hfi1_mutex);
@@ -1107,7 +1118,7 @@ static int user_init(struct file *fp)
         ret = wait_event_interruptible(uctxt->wait,
                        !test_bit(HFI1_CTXT_MASTER_UNINIT,
                              &uctxt->event_flags));
-        goto done;
+        goto expected;
     }
 
     /* initialize poll variables... */
@@ -1154,8 +1165,18 @@ static int user_init(struct file *fp)
         clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
         wake_up(&uctxt->wait);
     }
-    ret = 0;
 
+expected:
+    /*
+     * Expected receive has to be setup for all processes (including
+     * shared contexts). However, it has to be done after the master
+     * context has been fully configured as it depends on the
+     * eager/expected split of the RcvArray entries.
+     * Setting it up here ensures that the subcontexts will be waiting
+     * (due to the above wait_event_interruptible()) until the master
+     * is set up.
+     */
+    ret = hfi1_user_exp_rcv_init(fp);
 done:
     return ret;
 }
@@ -1225,46 +1246,6 @@ static int setup_ctxt(struct file *fp)
             if (ret)
                 goto done;
         }
-        /* Setup Expected Rcv memories */
-        uctxt->tid_pg_list = vzalloc(uctxt->expected_count *
-                         sizeof(struct page **));
-        if (!uctxt->tid_pg_list) {
-            ret = -ENOMEM;
-            goto done;
-        }
-        uctxt->physshadow = vzalloc(uctxt->expected_count *
-                        sizeof(*uctxt->physshadow));
-        if (!uctxt->physshadow) {
-            ret = -ENOMEM;
-            goto done;
-        }
-        /* allocate expected TID map and initialize the cursor */
-        atomic_set(&uctxt->tidcursor, 0);
-        uctxt->numtidgroups = uctxt->expected_count /
-            dd->rcv_entries.group_size;
-        uctxt->tidmapcnt = uctxt->numtidgroups / BITS_PER_LONG +
-            !!(uctxt->numtidgroups % BITS_PER_LONG);
-        uctxt->tidusemap = kzalloc_node(uctxt->tidmapcnt *
-                        sizeof(*uctxt->tidusemap),
-                        GFP_KERNEL, uctxt->numa_id);
-        if (!uctxt->tidusemap) {
-            ret = -ENOMEM;
-            goto done;
-        }
-        /*
-         * In case that the number of groups is not a multiple of
-         * 64 (the number of groups in a tidusemap element), mark
-         * the extra ones as used. This will effectively make them
-         * permanently used and should never be assigned. Otherwise,
-         * the code which checks how many free groups we have will
-         * get completely confused about the state of the bits.
-         */
-        if (uctxt->numtidgroups % BITS_PER_LONG)
-            uctxt->tidusemap[uctxt->tidmapcnt - 1] =
-                ~((1ULL << (uctxt->numtidgroups %
-                        BITS_PER_LONG)) - 1);
-        trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 0,
-                       uctxt->tidusemap, uctxt->tidmapcnt);
     }
     ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
     if (ret)
@@ -1503,367 +1484,6 @@ static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
     return 0;
 }
 
-#define num_user_pages(vaddr, len)                                    \
-    (1 + (((((unsigned long)(vaddr) +                                 \
-         (unsigned long)(len) - 1) & PAGE_MASK) -                     \
-           ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
-
-/**
- * tzcnt - count the number of trailing zeros in a 64bit value
- * @value: the value to be examined
- *
- * Returns the number of trailing least significant zeros in the
- * the input value. If the value is zero, return the number of
- * bits of the value.
- */
-static inline u8 tzcnt(u64 value)
-{
-    return value ? __builtin_ctzl(value) : sizeof(value) * 8;
-}
-
-static inline unsigned num_free_groups(unsigned long map, u16 *start)
-{
-    unsigned free;
-    u16 bitidx = *start;
-
-    if (bitidx >= BITS_PER_LONG)
-        return 0;
-    /* "Turn off" any bits set before our bit index */
-    map &= ~((1ULL << bitidx) - 1);
-    free = tzcnt(map) - bitidx;
-    while (!free && bitidx < BITS_PER_LONG) {
-        /* Zero out the last set bit so we look at the rest */
-        map &= ~(1ULL << bitidx);
-        /*
-         * Account for the previously checked bits and advance
-         * the bit index. We don't have to check for bitidx
-         * getting bigger than BITS_PER_LONG here as it would
-         * mean extra instructions that we don't need. If it
-         * did happen, it would push free to a negative value
-         * which will break the loop.
-         */
-        free = tzcnt(map) - ++bitidx;
-    }
-    *start = bitidx;
-    return free;
-}
-
-static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo)
-{
-    int ret = 0;
-    struct hfi1_filedata *fd = fp->private_data;
-    struct hfi1_ctxtdata *uctxt = fd->uctxt;
-    struct hfi1_devdata *dd = uctxt->dd;
-    unsigned tid, mapped = 0, npages, ngroups, exp_groups,
-        tidpairs = uctxt->expected_count / 2;
-    struct page **pages;
-    unsigned long vaddr, tidmap[uctxt->tidmapcnt];
-    dma_addr_t *phys;
-    u32 tidlist[tidpairs], pairidx = 0, tidcursor;
-    u16 useidx, idx, bitidx, tidcnt = 0;
-
-    vaddr = tinfo->vaddr;
-
-    if (offset_in_page(vaddr)) {
-        ret = -EINVAL;
-        goto bail;
-    }
-
-    npages = num_user_pages(vaddr, tinfo->length);
-    if (!npages) {
-        ret = -EINVAL;
-        goto bail;
-    }
-    if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
-               npages * PAGE_SIZE)) {
-        dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
-               (void *)vaddr, npages);
-        ret = -EFAULT;
-        goto bail;
-    }
-
-    memset(tidmap, 0, sizeof(tidmap[0]) * uctxt->tidmapcnt);
-    memset(tidlist, 0, sizeof(tidlist[0]) * tidpairs);
-
-    exp_groups = uctxt->expected_count / dd->rcv_entries.group_size;
-    /* which group set do we look at first? */
-    tidcursor = atomic_read(&uctxt->tidcursor);
-    useidx = (tidcursor >> 16) & 0xffff;
-    bitidx = tidcursor & 0xffff;
-
-    /*
-     * Keep going until we've mapped all pages or we've exhausted all
-     * RcvArray entries.
-     * This iterates over the number of tidmaps + 1
-     * (idx <= uctxt->tidmapcnt) so we check the bitmap which we
-     * started from one more time for any free bits before the
-     * starting point bit.
-     */
-    for (mapped = 0, idx = 0;
-         mapped < npages && idx <= uctxt->tidmapcnt;) {
-        u64 i, offset = 0;
-        unsigned free, pinned, pmapped = 0, bits_used;
-        u16 grp;
-
-        /*
-         * "Reserve" the needed group bits under lock so other
-         * processes can't step in the middle of it. Once
-         * reserved, we don't need the lock anymore since we
-         * are guaranteed the groups.
-         */
-        mutex_lock(&uctxt->exp_lock);
-        if (uctxt->tidusemap[useidx] == -1ULL ||
-            bitidx >= BITS_PER_LONG) {
-            /* no free groups in the set, use the next */
-            useidx = (useidx + 1) % uctxt->tidmapcnt;
-            idx++;
-            bitidx = 0;
-            mutex_unlock(&uctxt->exp_lock);
-            continue;
-        }
-        ngroups = ((npages - mapped) / dd->rcv_entries.group_size) +
-            !!((npages - mapped) % dd->rcv_entries.group_size);
-
-        /*
-         * If we've gotten here, the current set of groups does have
-         * one or more free groups.
-         */
-        free = num_free_groups(uctxt->tidusemap[useidx], &bitidx);
-        if (!free) {
-            /*
-             * Despite the check above, free could still come back
-             * as 0 because we don't check the entire bitmap but
-             * we start from bitidx.
-             */
-            mutex_unlock(&uctxt->exp_lock);
-            continue;
-        }
-        bits_used = min(free, ngroups);
-        tidmap[useidx] |= ((1ULL << bits_used) - 1) << bitidx;
-        uctxt->tidusemap[useidx] |= tidmap[useidx];
-        mutex_unlock(&uctxt->exp_lock);
-
-        /*
-         * At this point, we know where in the map we have free bits.
-         * properly offset into the various "shadow" arrays and compute
-         * the RcvArray entry index.
-         */
-        offset = ((useidx * BITS_PER_LONG) + bitidx) *
-            dd->rcv_entries.group_size;
-        pages = uctxt->tid_pg_list + offset;
-        phys = uctxt->physshadow + offset;
-        tid = uctxt->expected_base + offset;
-
-        /* Calculate how many pages we can pin based on free bits */
-        pinned = min((bits_used * dd->rcv_entries.group_size),
-                 (npages - mapped));
-        /*
-         * Now that we know how many free RcvArray entries we have,
-         * we can pin that many user pages.
-         */
-        ret = hfi1_acquire_user_pages(vaddr + (mapped * PAGE_SIZE),
-                          pinned, true, pages);
-        if (ret) {
-            /*
-             * We can't continue because the pages array won't be
-             * initialized. This should never happen,
-             * unless perhaps the user has mpin'ed the pages
-             * themselves.
-             */
-            dd_dev_info(dd,
-                    "Failed to lock addr %p, %u pages: errno %d\n",
-                    (void *) vaddr, pinned, -ret);
-            /*
-             * Let go of the bits that we reserved since we are not
-             * going to use them.
-             */
-            mutex_lock(&uctxt->exp_lock);
-            uctxt->tidusemap[useidx] &=
-                ~(((1ULL << bits_used) - 1) << bitidx);
-            mutex_unlock(&uctxt->exp_lock);
-            goto done;
-        }
-        /*
-         * How many groups do we need based on how many pages we have
-         * pinned?
-         */
-        ngroups = (pinned / dd->rcv_entries.group_size) +
-            !!(pinned % dd->rcv_entries.group_size);
-        /*
-         * Keep programming RcvArray entries for all the <ngroups> free
-         * groups.
-         */
-        for (i = 0, grp = 0; grp < ngroups; i++, grp++) {
-            unsigned j;
-            u32 pair_size = 0, tidsize;
-            /*
-             * This inner loop will program an entire group or the
-             * array of pinned pages (which ever limit is hit
-             * first).
-             */
-            for (j = 0; j < dd->rcv_entries.group_size &&
-                     pmapped < pinned; j++, pmapped++, tid++) {
-                tidsize = PAGE_SIZE;
-                phys[pmapped] = hfi1_map_page(dd->pcidev,
-                                  pages[pmapped], 0,
-                                  tidsize, PCI_DMA_FROMDEVICE);
-                trace_hfi1_exp_rcv_set(uctxt->ctxt,
-                               fd->subctxt,
-                               tid, vaddr,
-                               phys[pmapped],
-                               pages[pmapped]);
-                /*
-                 * Each RcvArray entry is programmed with one
-                 * page * worth of memory. This will handle
-                 * the 8K MTU as well as anything smaller
-                 * due to the fact that both entries in the
-                 * RcvTidPair are programmed with a page.
-                 * PSM currently does not handle anything
-                 * bigger than 8K MTU, so should we even worry
-                 * about 10K here?
-                 */
-                hfi1_put_tid(dd, tid, PT_EXPECTED,
-                         phys[pmapped],
-                         ilog2(tidsize >> PAGE_SHIFT) + 1);
-                pair_size += tidsize >> PAGE_SHIFT;
-                EXP_TID_RESET(tidlist[pairidx], LEN, pair_size);
-                if (!(tid % 2)) {
-                    tidlist[pairidx] |=
-                        EXP_TID_SET(IDX,
-                                (tid - uctxt->expected_base)
-                                / 2);
-                    tidlist[pairidx] |=
-                        EXP_TID_SET(CTRL, 1);
-                    tidcnt++;
-                } else {
-                    tidlist[pairidx] |=
-                        EXP_TID_SET(CTRL, 2);
-                    pair_size = 0;
-                    pairidx++;
-                }
-            }
-            /*
-             * We've programmed the entire group (or as much of the
-             * group as we'll use. Now, it's time to push it out...
-             */
-            flush_wc();
-        }
-        mapped += pinned;
-        atomic_set(&uctxt->tidcursor,
-               (((useidx & 0xffffff) << 16) |
-                ((bitidx + bits_used) & 0xffffff)));
-    }
-    trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 0, uctxt->tidusemap,
-                   uctxt->tidmapcnt);
-
-done:
-    /* If we've mapped anything, copy relevant info to user */
-    if (mapped) {
-        if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
-                 tidlist, sizeof(tidlist[0]) * tidcnt)) {
-            ret = -EFAULT;
-            goto done;
-        }
-        /* copy TID info to user */
-        if (copy_to_user((void __user *)(unsigned long)tinfo->tidmap,
-                 tidmap, sizeof(tidmap[0]) * uctxt->tidmapcnt))
-            ret = -EFAULT;
-    }
-bail:
-    /*
-     * Calculate mapped length. New Exp TID protocol does not "unwind" and
-     * report an error if it can't map the entire buffer. It just reports
-     * the length that was mapped.
-     */
-    tinfo->length = mapped * PAGE_SIZE;
-    tinfo->tidcnt = tidcnt;
-    return ret;
-}
-
-static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo)
-{
-    struct hfi1_filedata *fd = fp->private_data;
-    struct hfi1_ctxtdata *uctxt = fd->uctxt;
-    struct hfi1_devdata *dd = uctxt->dd;
-    unsigned long tidmap[uctxt->tidmapcnt];
-    struct page **pages;
-    dma_addr_t *phys;
-    u16 idx, bitidx, tid;
-    int ret = 0;
-
-    if (copy_from_user(&tidmap, (void __user *)(unsigned long)
-               tinfo->tidmap,
-               sizeof(tidmap[0]) * uctxt->tidmapcnt)) {
-        ret = -EFAULT;
-        goto done;
-    }
-    for (idx = 0; idx < uctxt->tidmapcnt; idx++) {
-        unsigned long map;
-
-        bitidx = 0;
-        if (!tidmap[idx])
-            continue;
-        map = tidmap[idx];
-        while ((bitidx = tzcnt(map)) < BITS_PER_LONG) {
-            int i, pcount = 0;
-            struct page *pshadow[dd->rcv_entries.group_size];
-            unsigned offset = ((idx * BITS_PER_LONG) + bitidx) *
-                dd->rcv_entries.group_size;
-
-            pages = uctxt->tid_pg_list + offset;
-            phys = uctxt->physshadow + offset;
-            tid = uctxt->expected_base + offset;
-            for (i = 0; i < dd->rcv_entries.group_size;
-                 i++, tid++) {
-                if (pages[i]) {
-                    hfi1_put_tid(dd, tid, PT_INVALID,
-                             0, 0);
-                    trace_hfi1_exp_rcv_free(uctxt->ctxt,
-                                fd->subctxt,
-                                tid, phys[i],
-                                pages[i]);
-                    pci_unmap_page(dd->pcidev, phys[i],
-                               PAGE_SIZE, PCI_DMA_FROMDEVICE);
-                    pshadow[pcount] = pages[i];
-                    pages[i] = NULL;
-                    pcount++;
-                    phys[i] = 0;
-                }
-            }
-            flush_wc();
-            hfi1_release_user_pages(pshadow, pcount, true);
-            clear_bit(bitidx, &uctxt->tidusemap[idx]);
-            map &= ~(1ULL<<bitidx);
-        }
-    }
-    trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 1, uctxt->tidusemap,
-                   uctxt->tidmapcnt);
-done:
-    return ret;
-}
-
-static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt)
-{
-    struct hfi1_devdata *dd = uctxt->dd;
-    unsigned tid;
-
-    dd_dev_info(dd, "ctxt %u unlocking any locked expTID pages\n",
-            uctxt->ctxt);
-    for (tid = 0; tid < uctxt->expected_count; tid++) {
-        struct page *p = uctxt->tid_pg_list[tid];
-        dma_addr_t phys;
-
-        if (!p)
-            continue;
-
-        phys = uctxt->physshadow[tid];
-        uctxt->physshadow[tid] = 0;
-        uctxt->tid_pg_list[tid] = NULL;
-        pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE);
-        hfi1_release_user_pages(&p, 1, true);
-    }
-}
-
 static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
              u16 pkey)
 {
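With TID caching, the MMU notifier can invalidate cached user buffers asynchronously, and the new HFI1_CMD_TID_INVAL_READ command handled above is how user space drains the list of invalidated TIDs (the driver copies tinfo.tidcnt back, as the copy_to_user() above shows). A hypothetical polling sketch, reusing the includes and layout assumptions of the earlier tid_update() example; whether tidcnt is consumed as an input capacity is itself an assumption:

    /* Hypothetical sketch: read back TIDs invalidated by the MMU notifier. */
    static int tid_inval_read(int dev_fd, __u32 *tidlist, __u32 max_tids,
                              __u32 *tidcnt)
    {
            struct hfi1_tid_info tinfo;
            struct hfi1_cmd cmd;

            memset(&tinfo, 0, sizeof(tinfo));
            tinfo.tidlist = (__u64)(uintptr_t)tidlist; /* caller's array */
            tinfo.tidcnt = max_tids;                   /* capacity (assumed) */

            memset(&cmd, 0, sizeof(cmd));
            cmd.type = HFI1_CMD_TID_INVAL_READ;
            cmd.len = sizeof(tinfo);
            cmd.addr = (__u64)(uintptr_t)&tinfo;

            if (write(dev_fd, &cmd, sizeof(cmd)) < 0)
                    return -1;

            *tidcnt = tinfo.tidcnt; /* number of invalidated TIDs returned */
            return 0;
    }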
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
index 53f464cc40ef..62157cc34727 100644
--- a/drivers/staging/rdma/hfi1/hfi.h
+++ b/drivers/staging/rdma/hfi1/hfi.h
@@ -240,18 +240,6 @@ struct hfi1_ctxtdata {
     u32 expected_count;
     /* index of first expected TID entry. */
     u32 expected_base;
-    /* cursor into the exp group sets */
-    atomic_t tidcursor;
-    /* number of exp TID groups assigned to the ctxt */
-    u16 numtidgroups;
-    /* size of exp TID group fields in tidusemap */
-    u16 tidmapcnt;
-    /* exp TID group usage bitfield array */
-    unsigned long *tidusemap;
-    /* pinned pages for exp sends, allocated at open */
-    struct page **tid_pg_list;
-    /* dma handles for exp tid pages */
-    dma_addr_t *physshadow;
 
     struct exp_tid_set tid_group_list;
     struct exp_tid_set tid_used_list;
@@ -1660,8 +1648,6 @@ int get_platform_config_field(struct hfi1_devdata *dd,
                   enum platform_config_table_type_encoding table_type,
                   int table_index, int field_index, u32 *data, u32 len);
 
-dma_addr_t hfi1_map_page(struct pci_dev *, struct page *, unsigned long,
-             size_t, int);
 const char *get_unit_name(int unit);
 
 /*
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
index 72c51431b2bf..00f52e815242 100644
--- a/drivers/staging/rdma/hfi1/init.c
+++ b/drivers/staging/rdma/hfi1/init.c
@@ -962,13 +962,10 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
     kfree(rcd->egrbufs.buffers);
 
     sc_free(rcd->sc);
-    vfree(rcd->physshadow);
-    vfree(rcd->tid_pg_list);
     vfree(rcd->user_event_mask);
     vfree(rcd->subctxt_uregbase);
     vfree(rcd->subctxt_rcvegrbuf);
     vfree(rcd->subctxt_rcvhdr_base);
-    kfree(rcd->tidusemap);
     kfree(rcd->opstats);
     kfree(rcd);
 }
diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/staging/rdma/hfi1/trace.h
index 86c12ebfd4f0..1e435675335f 100644
--- a/drivers/staging/rdma/hfi1/trace.h
+++ b/drivers/staging/rdma/hfi1/trace.h
@@ -153,92 +153,130 @@ TRACE_EVENT(hfi1_receive_interrupt,
     )
 );
 
-const char *print_u64_array(struct trace_seq *, u64 *, int);
+TRACE_EVENT(hfi1_exp_tid_reg,
+    TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr,
+         u32 npages, unsigned long va, unsigned long pa,
+         dma_addr_t dma),
+    TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+    TP_STRUCT__entry(
+        __field(unsigned, ctxt)
+        __field(u16, subctxt)
+        __field(u32, rarr)
+        __field(u32, npages)
+        __field(unsigned long, va)
+        __field(unsigned long, pa)
+        __field(dma_addr_t, dma)
+    ),
+    TP_fast_assign(
+        __entry->ctxt = ctxt;
+        __entry->subctxt = subctxt;
+        __entry->rarr = rarr;
+        __entry->npages = npages;
+        __entry->va = va;
+        __entry->pa = pa;
+        __entry->dma = dma;
+    ),
+    TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+          __entry->ctxt,
+          __entry->subctxt,
+          __entry->rarr,
+          __entry->npages,
+          __entry->pa,
+          __entry->va,
+          __entry->dma
+    )
+);
 
-TRACE_EVENT(hfi1_exp_tid_map,
-    TP_PROTO(unsigned ctxt, u16 subctxt, int dir,
-         unsigned long *maps, u16 count),
-    TP_ARGS(ctxt, subctxt, dir, maps, count),
+TRACE_EVENT(hfi1_exp_tid_unreg,
+    TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages,
+         unsigned long va, unsigned long pa, dma_addr_t dma),
+    TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
     TP_STRUCT__entry(
         __field(unsigned, ctxt)
         __field(u16, subctxt)
-        __field(int, dir)
-        __field(u16, count)
-        __dynamic_array(unsigned long, maps, sizeof(*maps) * count)
+        __field(u32, rarr)
+        __field(u32, npages)
+        __field(unsigned long, va)
+        __field(unsigned long, pa)
+        __field(dma_addr_t, dma)
     ),
     TP_fast_assign(
         __entry->ctxt = ctxt;
         __entry->subctxt = subctxt;
-        __entry->dir = dir;
-        __entry->count = count;
-        memcpy(__get_dynamic_array(maps), maps,
-               sizeof(*maps) * count);
+        __entry->rarr = rarr;
+        __entry->npages = npages;
+        __entry->va = va;
+        __entry->pa = pa;
+        __entry->dma = dma;
     ),
-    TP_printk("[%3u:%02u] %s tidmaps %s",
+    TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
          __entry->ctxt,
          __entry->subctxt,
-         (__entry->dir ? ">" : "<"),
-         print_u64_array(p, __get_dynamic_array(maps),
-                 __entry->count)
+         __entry->rarr,
+         __entry->npages,
+         __entry->pa,
+         __entry->va,
+         __entry->dma
     )
 );
 
-TRACE_EVENT(hfi1_exp_rcv_set,
-    TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
-         unsigned long vaddr, u64 phys_addr, void *page),
-    TP_ARGS(ctxt, subctxt, tid, vaddr, phys_addr, page),
+TRACE_EVENT(hfi1_exp_tid_inval,
+    TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr,
+         u32 npages, dma_addr_t dma),
+    TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
     TP_STRUCT__entry(
         __field(unsigned, ctxt)
         __field(u16, subctxt)
-        __field(u32, tid)
-        __field(unsigned long, vaddr)
-        __field(u64, phys_addr)
-        __field(void *, page)
+        __field(unsigned long, va)
+        __field(u32, rarr)
+        __field(u32, npages)
+        __field(dma_addr_t, dma)
     ),
     TP_fast_assign(
         __entry->ctxt = ctxt;
         __entry->subctxt = subctxt;
-        __entry->tid = tid;
-        __entry->vaddr = vaddr;
-        __entry->phys_addr = phys_addr;
-        __entry->page = page;
+        __entry->va = va;
+        __entry->rarr = rarr;
+        __entry->npages = npages;
+        __entry->dma = dma;
    ),
-    TP_printk("[%u:%u] TID %u, vaddrs 0x%lx, physaddr 0x%llx, pgp %p",
+    TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
          __entry->ctxt,
          __entry->subctxt,
-         __entry->tid,
-         __entry->vaddr,
-         __entry->phys_addr,
-         __entry->page
+         __entry->rarr,
+         __entry->npages,
+         __entry->va,
+         __entry->dma
     )
 );
 
-TRACE_EVENT(hfi1_exp_rcv_free,
-    TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
-         unsigned long phys, void *page),
-    TP_ARGS(ctxt, subctxt, tid, phys, page),
+TRACE_EVENT(hfi1_mmu_invalidate,
+    TP_PROTO(unsigned ctxt, u16 subctxt, const char *type,
+         unsigned long start, unsigned long end),
+    TP_ARGS(ctxt, subctxt, type, start, end),
     TP_STRUCT__entry(
         __field(unsigned, ctxt)
         __field(u16, subctxt)
-        __field(u32, tid)
-        __field(unsigned long, phys)
-        __field(void *, page)
+        __string(type, type)
+        __field(unsigned long, start)
+        __field(unsigned long, end)
     ),
     TP_fast_assign(
         __entry->ctxt = ctxt;
         __entry->subctxt = subctxt;
-        __entry->tid = tid;
-        __entry->phys = phys;
-        __entry->page = page;
+        __assign_str(type, type);
+        __entry->start = start;
+        __entry->end = end;
     ),
-    TP_printk("[%u:%u] freeing TID %u, 0x%lx, pgp %p",
+    TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
          __entry->ctxt,
          __entry->subctxt,
-         __entry->tid,
-         __entry->phys,
-         __entry->page
+         __get_str(type),
+         __entry->start,
+         __entry->end
     )
 );
+
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM hfi1_tx
 
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c
index d33f579675b7..79612a2bd07d 100644
--- a/drivers/staging/rdma/hfi1/user_exp_rcv.c
+++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c
@@ -902,6 +902,8 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
         return -EFAULT;
     }
     hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
+    trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry,
+                   npages, node->virt, node->phys, phys);
     return 0;
 }
 
@@ -947,6 +949,10 @@ static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
     struct hfi1_ctxtdata *uctxt = fd->uctxt;
     struct hfi1_devdata *dd = uctxt->dd;
 
+    trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
+                 node->npages, node->virt, node->phys,
+                 node->dma_addr);
+
     hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
     /*
      * Make sure device has seen the write before we unpin the
@@ -1023,6 +1029,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
     struct mmu_rb_node *node;
     unsigned long addr = start;
 
+    trace_hfi1_mmu_invalidate(uctxt->ctxt, fd->subctxt, mmu_types[type],
+                  start, end);
+
     spin_lock(&fd->rb_lock);
     while (addr < end) {
         node = mmu_rb_search_by_addr(root, addr);
@@ -1049,6 +1058,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
         if (node->freed)
             continue;
 
+        trace_hfi1_exp_tid_inval(uctxt->ctxt, fd->subctxt, node->virt,
+                     node->rcventry, node->npages,
+                     node->dma_addr);
         node->freed = true;
 
         spin_lock(&fd->invalid_lock);
diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/staging/rdma/hfi1/user_pages.c
index 692de658f0dc..1854c0c7ce7e 100644
--- a/drivers/staging/rdma/hfi1/user_pages.c
+++ b/drivers/staging/rdma/hfi1/user_pages.c
@@ -54,20 +54,6 @@
 
 #include "hfi.h"
 
-/**
- * hfi1_map_page - a safety wrapper around pci_map_page()
- *
- */
-dma_addr_t hfi1_map_page(struct pci_dev *hwdev, struct page *page,
-             unsigned long offset, size_t size, int direction)
-{
-    dma_addr_t phys;
-
-    phys = pci_map_page(hwdev, page, offset, size, direction);
-
-    return phys;
-}
-
 int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable,
                 struct page **pages)
 {
diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h
index 92be2e373019..a533cecab14f 100644
--- a/include/uapi/rdma/hfi/hfi1_user.h
+++ b/include/uapi/rdma/hfi/hfi1_user.h
@@ -66,7 +66,7 @@
  * The major version changes when data structures change in an incompatible
  * way. The driver must be the same for initialization to succeed.
  */
-#define HFI1_USER_SWMAJOR 4
+#define HFI1_USER_SWMAJOR 5
 
 /*
  * Minor version differences are always compatible
@@ -241,11 +241,6 @@ struct hfi1_tid_info {
     __u32 tidcnt;
     /* length of transfer buffer programmed by this request */
     __u32 length;
-    /*
-     * pointer to bitmap of TIDs used for this call;
-     * checked for being large enough at open
-     */
-    __u64 tidmap;
 };
 
 struct hfi1_cmd {
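To round out the user-facing flow: TIDs programmed with HFI1_CMD_TID_UPDATE are released with HFI1_CMD_TID_FREE, which after this patch takes the TID list returned by the update call rather than the removed tidmap bitmap. A hypothetical sketch under the same assumptions as the examples above:

    /* Hypothetical sketch: free previously programmed TIDs. */
    static int tid_free(int dev_fd, __u32 *tidlist, __u32 tidcnt)
    {
            struct hfi1_tid_info tinfo;
            struct hfi1_cmd cmd;

            memset(&tinfo, 0, sizeof(tinfo));
            tinfo.tidlist = (__u64)(uintptr_t)tidlist; /* from TID_UPDATE */
            tinfo.tidcnt = tidcnt;

            memset(&cmd, 0, sizeof(cmd));
            cmd.type = HFI1_CMD_TID_FREE;
            cmd.len = sizeof(tinfo);
            cmd.addr = (__u64)(uintptr_t)&tinfo;

            /* The driver writes the number of freed TIDs back via tidcnt. */
            return write(dev_fd, &cmd, sizeof(cmd)) < 0 ? -1 : 0;
    }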